PTTcrawler (PTT article crawler)

Environment

python2

Usage

$ python pttcrawler.py [board name] [pages to crawl]
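
For example, crawling the 2 newest index pages of the Gossiping board writes the articles to a plain-text file named PttData-Gossiping-<timestamp>.txt:

$ python pttcrawler.py Gossiping 2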
  • Basic version
# coding=UTF-8

import sys
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
requests.packages.urllib3.disable_warnings()


Board = ''
# over18 confirmation form; 'from' is refreshed in __main__ once the board name is known
payload = {
    'from': '/bbs/' + Board + '/index.html',
    'yes': 'yes'
}

def getPageNumber(content) :
    startIndex = content.find('index')
    endIndex = content.find('.html')
    pageNumber = content[startIndex+5 : endIndex]
    return pageNumber

# python PttCrawler.py Gossiping 2 
if __name__ == "__main__":
    Board = str(sys.argv[1])
    ParsingPage = int(sys.argv[2])
    # fill in the board name now that it is known
    payload['from'] = '/bbs/' + Board + '/index.html'
    print 'Start parsing [' + Board + ']....'
    start_time = time.time()
    rs = requests.session()
    # some boards (e.g. Gossiping) require an over-18 confirmation
    res = rs.post('https://www.ptt.cc/ask/over18',verify = False, data = payload)
    res = rs.get('https://www.ptt.cc/bbs/'+ Board +'/index.html',verify = False)
    soup = BeautifulSoup(res.text,'html.parser')
    # the second '.btn.wide' button is '‹ 上頁' (previous page); its index number + 1 is the newest page
    ALLpageURL = soup.select('.btn.wide')[1]['href']
    ALLpage = int(getPageNumber(ALLpageURL)) + 1
    print 'Total pages:', ALLpage
    URLlist=[]
    fileName='PttData-'+ Board + '-' + datetime.now().strftime('%Y%m%d%H%M%S')+'.txt'

    # collect the article URLs from each index page
    for index in range(ALLpage, ALLpage-int(ParsingPage), -1):
        url = 'https://www.ptt.cc/bbs/'+ Board +'/index'+ str(index) +'.html'
        res = rs.get(url, verify = False)
        soup = BeautifulSoup(res.text,'html.parser')
        UrlPer = []
        for entry in soup.select('.r-ent'):
            atag = entry.select('.title')[0].find('a')
            # deleted articles have no <a> tag in the title block
            if atag is not None:
                URL = atag['href']
                UrlPer.append('https://www.ptt.cc' + URL)
        # reverse the list: on the web interface the newest article is at the bottom of the page
        for URL in reversed(UrlPer):
            URLlist.append(URL)   

    # separator written between articles in the output file ('下一篇' = next article)
    strNext = u"\n\n\n\n***************下一篇***************\n\n\n\n\n"
    content = ''
    # fetch the content of every article
    for URL in URLlist:
        res = rs.get(URL, verify = False)
        soup = BeautifulSoup(res.text, 'html.parser')
        data = soup.select('.bbs-screen.bbs-content')[0].text
        content += (data + strNext)
        time.sleep(0.05) 

    with open(fileName,'wb') as f:
        f.write( content.encode('utf8') )  

    print u'====================END===================='
    print u'execution time:' + str(time.time() - start_time)+'s'

#coding=utf-8 
import re
import sys
import json
import requests
import io
import random
from time import sleep
from datetime import datetime
from bs4 import BeautifulSoup  
requests.packages.urllib3.disable_warnings()

PttName=""
load={
'from':'/bbs/'+PttName+'/index.html',
'yes':'yes' 
}

rs=requests.session()
res=rs.post('https://www.ptt.cc/ask/over18',verify=False,data=load)
FILENAME=""

def PageCount(PttName):
    res=rs.get('https://www.ptt.cc/bbs/'+PttName+'/index.html',verify=False)
    soup=BeautifulSoup(res.text,'html.parser')
    ALLpageURL = soup.select('.btn.wide')[1]['href']
    ALLpage=int(getPageNumber(ALLpageURL))+1
    return  ALLpage 

def crawler(PttName, ParsingPage):
    ALLpage = PageCount(PttName)
    g_id = 0

    for number in range(ALLpage, ALLpage - int(ParsingPage), -1):
        res = rs.get('https://www.ptt.cc/bbs/' + PttName + '/index' + str(number) + '.html', verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        for tag in soup.select('div.title'):
            try:
                atag = tag.find('a')
                # random short delay so the requests do not look like an attack
                sleep(random.uniform(0, 1) / 5)
                if atag:
                    URL = atag['href']
                    link = 'https://www.ptt.cc' + URL
                    g_id = g_id + 1
                    parseGos(link, g_id)
            except:
                print 'error:', URL

def parseGos(link , g_id):
        res=rs.get(link,verify=False)
        soup = BeautifulSoup(res.text,'html.parser')
        # author
        author  = soup.select('.article-meta-value')[0].text
        #author = soup.find("span", {'class': 'article-meta-value'}).text              
        #print 'author:',author
        # title
        title = soup.select('.article-meta-value')[2].text
        #print 'title:',title
        # date
        date = soup.select('.article-meta-value')[3].text
        #print 'date:',date
        # ip       
        try:
                targetIP=u'※ 發信站: 批踢踢實業坊'
                ip =  soup.find(string = re.compile(targetIP))
                ip = re.search(r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*",ip).group()
        except:
                ip = "ip is not find"
        #print 'ip:',ip

        # content
        content = soup.find(id="main-content").text
        target_content=u'※ 發信站: 批踢踢實業坊(ptt.cc),'
        content = content.split(target_content)
        content = content[0].split(date)
        main_content = content[1].replace('\n', '  ').replace('\t', '  ')
        #print 'content:',main_content

        # message
        num , g , b , n ,message = 0,0,0,0,{}
        for tag in soup.select('div.push'):
                num += 1
                push_tag = tag.find("span", {'class': 'push-tag'}).text
                #print "push_tag:",push_tag
                push_userid = tag.find("span", {'class': 'push-userid'}).text       
                #print "push_userid:",push_userid
                push_content = tag.find("span", {'class': 'push-content'}).text   
                push_content = push_content[1:]
                #print "push_content:",push_content
                push_ipdatetime = tag.find("span", {'class': 'push-ipdatetime'}).text   
                push_ipdatetime = remove(push_ipdatetime, '\n')
                #print "push-ipdatetime:",push_ipdatetime 

                message[num]={"狀態":push_tag.encode('utf-8'),"留言者":push_userid.encode('utf-8'),
                              "留言內容":push_content.encode('utf-8'),"留言時間":push_ipdatetime.encode('utf-8')}
                if push_tag == u'推 ':
                        g += 1
                elif push_tag == u'噓 ':
                        b += 1
                else:
                        n += 1

        messageNum = {"g":g,"b":b,"n":n,"all":num}
        # json-data  type(d) dict

        d={ "a_ID":g_id , "b_作者":author.encode('utf-8'), "c_標題":title.encode('utf-8'), "d_日期":date.encode('utf-8'),
            "e_ip":ip.encode('utf-8'), "f_內文":main_content.encode('utf-8'), "g_推文":message,"h_推文總數":messageNum}
        json_data = json.dumps(d, ensure_ascii=False, indent=4, sort_keys=True) + ','
        store(json_data)

def store(data):
    with open(FILENAME, 'a') as f:
        f.write(data)

def remove(value, deletechars):
    for c in deletechars:
        value = value.replace(c,'')
    return value.rstrip()


def getPageNumber(content) :
    startIndex = content.find('index')
    endIndex = content.find('.html')
    pageNumber = content[startIndex+5 : endIndex]
    return pageNumber

if __name__ == "__main__":  
   PttName = str(sys.argv[1])
   ParsingPage = int(sys.argv[2])
   FILENAME='data-'+PttName+'-'+datetime.now().strftime('%Y-%m-%d-%H-%M-%S')+'.json'
   store('[') 
   print 'Start parsing [',PttName,']....'
   crawler(PttName,ParsingPage)
   store(']') 


   with open(FILENAME, 'r') as f:
        p = f.read()
   with open(FILENAME, 'w') as f:
        #f.write(p.replace(',]',']'))
        f.write(p[:-2]+']')

A crawler for the PTT web interface

PTT article crawler

Features

  • Crawls PTT articles (including push comments)

Output format: a .json file with the following structure

"a_ID": 編號,
"b_作者": 作者名,
"c_標題": 標題,
"d_日期": 發文時間,
"e_ip": 發文ip,
"f_內文": 內文,
"g_推文": {
    "推文編號": {
        "狀態": 推 or 噓 or →,
        "留言內容": 留言內容,
        "留言時間": 留言時間,
        "留言者": 留言者
    }
},
"h_推文總數": {
    "all": 推文數目,
    "b": 噓數,
    "g": 推數,
    "n": →數
}
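
Since the output file is a JSON array, it can be read back with the standard json module. A minimal sketch (Python 3; the file name below only illustrates the crawler's data-<board>-<timestamp>.json naming):

# -*- coding: utf-8 -*-
import io
import json

# illustrative file name following the crawler's data-<board>-<timestamp>.json pattern
with io.open('data-Gossiping-2018-01-01-12-00-00.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)

for article in articles:
    # print each article's title and its total number of push comments
    print(article['c_標題'], article['h_推文總數']['all'])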

Environment

python 2.7.3

Usage

$ python pttcrawler.py [board name] [pages to crawl]

Example

Crawl 2 pages of articles from the PTT Gossiping board:

$ python pttcrawler.py  Gossiping  2

Assuming the board has 100 index pages in total, it will crawl the content of

https://www.ptt.cc/bbs/Gossiping/index100.html and https://www.ptt.cc/bbs/Gossiping/index99.html (newest pages first).
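
The newest index number is taken from the '‹ 上頁' (previous page) button of the board's index page, exactly as getPageNumber does; a minimal sketch of how the crawled pages are chosen:

# sketch: which index pages are crawled for `python pttcrawler.py Gossiping 2`
ParsingPage = 2
ALLpageURL = '/bbs/Gossiping/index99.html'   # href of the '‹ 上頁' button when the board has 100 pages
pageNumber = ALLpageURL[ALLpageURL.find('index') + 5:ALLpageURL.find('.html')]
ALLpage = int(pageNumber) + 1                # 100, the newest index page
for index in range(ALLpage, ALLpage - ParsingPage, -1):
    print('https://www.ptt.cc/bbs/Gossiping/index' + str(index) + '.html')
# -> index100.html, then index99.html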


Environment

python 3

Usage

$ python pttcrawler.py [board name] [pages to crawl]
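
For example, crawling the 2 newest index pages of Gossiping writes the result to data-Gossiping-<timestamp>.json:

$ python pttcrawler.py Gossiping 2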
#coding=utf-8
import re
import sys
import json
import requests
import time
from datetime import datetime
from bs4 import BeautifulSoup


requests.packages.urllib3.disable_warnings()

PttName, fileName = "", ""
load = {
    'from': '/bbs/' + PttName + '/index.html',
    'yes': 'yes'
}

rs = requests.session()


def getPageNumber(content):
    startIndex = content.find('index')
    endIndex = content.find('.html')
    pageNumber = content[startIndex + 5: endIndex]
    return pageNumber


def over18(board):
    res = rs.get('https://www.ptt.cc/bbs/' + board + '/index.html', verify=False)
    # if the final URL contains 'over18', the board requires an age confirmation
    if (res.url.find('over18') > -1):
        print("over-18 board")
        load = {
            'from': '/bbs/' + board + '/index.html',
            'yes': 'yes'
        }
        res = rs.post('https://www.ptt.cc/ask/over18', verify=False, data=load)
        return BeautifulSoup(res.text, 'html.parser')
    return BeautifulSoup(res.text, 'html.parser')


def crawler(url_list):
    count, g_id = 0, 0
    total = len(url_list)
    # crawl each index page
    while url_list:
        url = url_list.pop(0)
        res = rs.get(url, verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        # if the server is busy, put the URL back into the queue and retry after one second
        if (soup.title.text.find('Service Temporarily') > -1):
            url_list.append(url)
            # print u'error_URL:', url
            # print u'error_URL head:', soup.title.text
            time.sleep(1)
        else:
            count += 1
            # print u'OK_URL:', url
            # print u'OK_URL head:', soup.title.text
            for r_ent in soup.find_all(class_="r-ent"):
                # get each article's URL from the index page
                link = r_ent.find('a')
                if (link):
                    # the article still exists, so build its full URL
                    URL = 'https://www.ptt.cc' + link['href']
                    g_id = g_id + 1
                    # short delay so the requests do not look like an attack
                    time.sleep(0.1)
                    # parse the article content
                    parseGos(URL, g_id)
            print("download: " + str(100 * count / total) + " %.")
        # short delay between index pages so the requests do not look like an attack
        time.sleep(0.1)


def checkformat(soup, class_tag, data, index, link):
    # some articles have had fields (title, date, ...) removed by the author, so guard against missing tags
    try:
        content = soup.select(class_tag)[index].text
    except Exception as e:
        print('checkformat error URL', link)
        # print 'checkformat:',str(e)
        content = "no " + data
    return content


def parseGos(link, g_id):
    res = rs.get(link, verify=False)
    soup = BeautifulSoup(res.text, 'html.parser')

    # author of the article
    # author  = soup.select('.article-meta-value')[0].text
    author = checkformat(soup, '.article-meta-value', 'author', 0, link)
    # print 'author:',author

    # title of the article
    # title = soup.select('.article-meta-value')[2].text
    title = checkformat(soup, '.article-meta-value', 'title', 2, link)
    # print 'title:',title

    # date of the article
    # date = soup.select('.article-meta-value')[3].text
    date = checkformat(soup, '.article-meta-value', 'date', 3, link)
    # print 'date:',date

    # IP address the article was posted from
    try:
        targetIP = u'※ 發信站: 批踢踢實業坊'
        ip = soup.find(string=re.compile(targetIP))
        ip = re.search(r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*", ip).group()
    except:
        ip = "ip is not find"
    # print 'ip:',ip

    # main body of the article
    try:
        content = soup.find(id="main-content").text
        target_content = u'※ 發信站: 批踢踢實業坊(ptt.cc),'
        content = content.split(target_content)
        content = content[0].split(date)
        main_content = content[1].replace('\n', '  ')
        # print 'content:',main_content
    except Exception as e:
        main_content = 'main_content error'
        print('main_content error URL' + link)
        # print 'main_content error:',str(e)

    # push comments (推文)
    num, g, b, n, message = 0, 0, 0, 0, {}

    for tag in soup.select('div.push'):
        try:
            # push_tag: comment type, 推 (push), 噓 (boo) or → (neutral)
            push_tag = tag.find("span", {'class': 'push-tag'}).text
            # print "push_tag:",push_tag

            # push_userid: the commenter's user id
            push_userid = tag.find("span", {'class': 'push-userid'}).text
            # print "push_userid:",push_userid

            # push_content: the comment text
            push_content = tag.find("span", {'class': 'push-content'}).text
            push_content = push_content[1:]
            # print "push_content:",push_content

            # push_ipdatetime: the comment's IP / timestamp
            push_ipdatetime = tag.find("span", {'class': 'push-ipdatetime'}).text
            push_ipdatetime = push_ipdatetime.rstrip()
            # print "push-ipdatetime:",push_ipdatetime

            num += 1
            message[num] = {"狀態": push_tag, "留言者": push_userid,
                            "留言內容": push_content, "留言時間": push_ipdatetime}

            # count comment types: g = 推 (push), b = 噓 (boo), n = → (neutral)
            if push_tag == u'推 ':
                g += 1
            elif push_tag == u'噓 ':
                b += 1
            else:
                n += 1
        except Exception as e:
            print("push error URL:" + link)
            # print "push error:",str(e)

    messageNum = {"g": g, "b": b, "n": n, "all": num}

    d = {"a_ID": g_id, "b_作者": author, "c_標題": title, "d_日期": date,
         "e_ip": ip, "f_內文": main_content, "g_推文": message, "h_推文總數": messageNum}

    # json.dumps escapes Chinese characters to ASCII by default, so disable ensure_ascii
    json_data = json.dumps(d, ensure_ascii=False, indent=4, sort_keys=True) + ','
    store(json_data)


def store(data):
    # append as UTF-8 so Chinese text survives regardless of the console encoding
    with open(fileName, 'a', encoding='utf-8') as f:
        f.write(data)


if __name__ == "__main__":
    PttName, ParsingPage = str(sys.argv[1]), int(sys.argv[2])
    start_time = time.time()
    print('Start parsing ' + PttName + '....')
    fileName = 'data-' + PttName + '-' + datetime.now().strftime('%Y%m%d%H%M%S') + '.json'
    # some boards require an over-18 confirmation; handle it if necessary
    soup = over18(PttName)
    ALLpageURL = soup.select('.btn.wide')[1]['href']
    # total number of index pages on this board
    ALLpage = int(getPageNumber(ALLpageURL)) + 1
    index_list = []
    for index in range(ALLpage, ALLpage - int(ParsingPage), -1):
        page_url = 'https://www.ptt.cc/bbs/' + PttName + '/index' + str(index) + '.html'
        index_list.append(page_url)

    store('[\n')
    crawler(index_list)

    # remove the trailing "," after the last article so the file is valid JSON
    with open(fileName, 'r', encoding='utf-8') as f:
        content = f.read()
    with open(fileName, 'w', encoding='utf-8') as f:
        f.write(content[:-1] + "\n]")

    print("爬蟲結束...")
    print("execution time:" + str(time.time() - start_time) + "s")