nJcx's Blog

生而不忧,死而不怖。得其时横刀天下,不得其时蓬头而行。

爬取廖雪峰博客保存为pdf


# -*- coding: utf-8 -*-
import urllib2
import pdfkit,re
from bs4 import BeautifulSoup
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so the module is reloaded to get it back. UTF-8 is then
# made the default codec for implicit str <-> unicode conversions, which the
# rest of this script relies on when stringifying parsed pages.
reload(sys)
sys.setdefaultencoding( "utf-8" )

def download(url, numretries=5):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop browser User-Agent header.

    Args:
        url: Address of the page to fetch.
        numretries: How many more attempts may be made after a server-side
            (HTTP 5xx) failure.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        request failed with a non-retryable error or the retries ran out.
    """
    headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.15 Safari/537.36'}
    request = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Always release the underlying socket, even if read() fails.
            response.close()
    except urllib2.URLError as e:
        # Only HTTPError carries a status code; plain URLError (DNS failure,
        # refused connection, ...) has none and is not retried.
        code = getattr(e, 'code', None)
        # Server-side errors are 500-599; anything else is not retried.
        if numretries > 0 and code is not None and 500 <= code < 600:
            return download(url, numretries - 1)
        return None

def get_url_list(url):
    """Collect the link targets listed in the page's side navigation.

    The page at *url* is downloaded and parsed, and the second element whose
    class is ``uk-nav uk-nav-side`` is scanned for ``<li>`` entries
    (presumably the chapter menu -- verify against the live page markup).

    Args:
        url: Address of the page that contains the navigation menu.

    Returns:
        A list of absolute URLs, in document order. Links that do not start
        with ``http`` are prefixed with the site root.

    Raises:
        IOError: If the page could not be downloaded.
        IndexError: If fewer than two matching menu elements are present.
    """
    response = download(url)
    if response is None:
        # download() signals failure with None; fail with a clear message
        # instead of an obscure error from inside the HTML parser.
        raise IOError("failed to download %s" % url)
    soup = BeautifulSoup(response, "lxml")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = []
    for li in menu_tag.find_all("li"):
        anchor = li.a
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # Entry without a link (or with an empty href): nothing to fetch.
            continue
        if not href.startswith("http"):
            href = "".join(['http://www.liaoxuefeng.com', href])
        urls.append(href)
    return urls

def parse_url_to_html(url, name='1.html'):
    """Download a page, keep only its content element and save it as HTML.

    The first element with class ``x-wiki-content`` is extracted, the text of
    the first ``<h4>`` is inserted into it as a centred ``<h1>`` heading, and
    relative ``<img src=...>`` addresses are rewritten to absolute ones so
    the saved file still finds its images.

    Args:
        url: Address of the page to convert.
        name: Path of the HTML file to write.

    Raises:
        IOError: If the page could not be downloaded.
        IndexError: If the page has no ``x-wiki-content`` element.
    """
    response = download(url)
    if response is None:
        # download() signals failure with None.
        raise IOError("failed to download %s" % url)
    soup = BeautifulSoup(response, "lxml")
    body = soup.find_all(class_="x-wiki-content")[0]
    title = soup.find('h4').get_text()
    center_tag = soup.new_tag("center")
    title_tag = soup.new_tag('h1')
    title_tag.string = title
    center_tag.insert(1, title_tag)
    body.insert(1, center_tag)

    html = str(body)
    # group(1): everything up to and including src=", group(2): the src
    # value itself, group(3): the closing quote.
    pattern = r'(<img .*?src=")(.*?)(")'

    def func(m):
        # The decision must look at the src value (group 2); group 3 is only
        # the closing quote and can never start with "http".
        if not m.group(2).startswith("http"):
            return "".join([m.group(1), 'http://www.liaoxuefeng.com', m.group(2), m.group(3)])
        return "".join([m.group(1), m.group(2), m.group(3)])

    html = re.compile(pattern).sub(func, html)
    with open(name, 'wb') as f:
        f.write(html)

def save_pdf(url):
    """Save every page linked from the menu at *url* into ``first.pdf``.

    Each linked page is first written to ``<index>.html`` in the current
    directory; the resulting files are then handed to pdfkit in menu order.

    Args:
        url: Address of the page whose navigation menu lists the pages to
            include.
    """
    urls = get_url_list(url)
    htmls = []
    # Parenthesised single-argument print: same output under Python 2.
    print('下载ing')
    # enumerate gives each page its own index; looking the position up with
    # urls.index() was quadratic and mapped duplicate URLs to the same file.
    for index, page_url in enumerate(urls):
        name = str(index) + '.html'
        parse_url_to_html(page_url, name)
        htmls.append(name)

    options = {
        'page-size': 'Letter',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ]
    }
    print('下载结束')
    pdfkit.from_file(htmls, 'first.pdf', options=options)


if __name__ == '__main__':
    # Script entry point: build first.pdf from the pages listed in the
    # navigation menu of this wiki page.
    save_pdf(
        'http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
    )