Crawl PTT for items + send LINE notifications
# -*- coding: utf-8 -*-
import re
import sys
import json
import requests
import random
from time import sleep
from datetime import datetime
from bs4 import BeautifulSoup

# Requests to ptt.cc use verify=False below; silence the resulting TLS warnings.
requests.packages.urllib3.disable_warnings()

PttName = ""  # overwritten from sys.argv in __main__

# Pass PTT's over-18 age gate once; the session keeps the cookie for later requests.
load = {
    'from': '/bbs/' + PttName + '/index.html',
    'yes': 'yes'
}
rs = requests.session()
res = rs.post('https://www.ptt.cc/ask/over18', verify=False, data=load)
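# Optional sanity check (not part of the original flow): the POST above should
# leave an 'over18' cookie on the session; uncomment to verify before crawling.
# assert 'over18' in rs.cookies, "over18 age-gate cookie was not set"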
FILENAME=""
def PageCount(PttName):
res=rs.get('https://www.ptt.cc/bbs/'+PttName+'/index.html',verify=False)
soup=BeautifulSoup(res.text,'html.parser')
ALLpageURL = soup.select('.btn.wide')[1]['href']
ALLpage=int(getPageNumber(ALLpageURL))+1
return ALLpage
def crawler(PttName, ParsingPage):
    ALLpage = PageCount(PttName)
    g_id = 0
    URL = ''  # initialized so the except handler below can always print it
    # Walk backwards from the newest index page.
    for number in range(ALLpage, ALLpage - int(ParsingPage), -1):
        res = rs.get('https://www.ptt.cc/bbs/' + PttName + '/index' + str(number) + '.html', verify=False)
        soup = BeautifulSoup(res.text, 'html.parser')
        for tag in soup.select('div.title'):
            try:
                atag = tag.find('a')
                # Random 0-0.2 s delay between requests to stay polite.
                sleep(random.uniform(0, 1) / 5)
                if atag:  # deleted posts have no <a> tag
                    URL = atag['href']
                    link = 'https://www.ptt.cc' + URL
                    g_id = g_id + 1
                    parseGos(link, g_id)
            except:
                print 'error:', URL
def parseGos(link, g_id):
    res = rs.get(link, verify=False)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Meta fields appear in order: author, board, title, date.
    author = soup.select('.article-meta-value')[0].text
    title = soup.select('.article-meta-value')[2].text
    date = soup.select('.article-meta-value')[3].text
    try:
        targetIP = u'※ 發信站: 批踢踢實業坊'
        ip = soup.find(string=re.compile(targetIP))
        ip = re.search(r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*", ip).group()
    except:
        ip = "ip not found"
    # The article body sits between the date line and the signature line.
    content = soup.find(id="main-content").text
    target_content = u'※ 發信站: 批踢踢實業坊(ptt.cc),'
    content = content.split(target_content)
    content = content[0].split(date)
    main_content = content[1].replace('\n', ' ').replace('\t', ' ')
    # Tally pushes (推), boos (噓) and neutral comments (→).
    num, g, b, n, message = 0, 0, 0, 0, {}
    for tag in soup.select('div.push'):
        num += 1
        push_tag = tag.find("span", {'class': 'push-tag'}).text
        push_userid = tag.find("span", {'class': 'push-userid'}).text
        push_content = tag.find("span", {'class': 'push-content'}).text
        push_content = push_content[1:]  # drop the leading ':'
        push_ipdatetime = tag.find("span", {'class': 'push-ipdatetime'}).text
        push_ipdatetime = remove(push_ipdatetime, '\n')
        message[num] = {"狀態": push_tag.encode('utf-8'),
                        "留言者": push_userid.encode('utf-8'),
                        "留言內容": push_content.encode('utf-8'),
                        "留言時間": push_ipdatetime.encode('utf-8')}
        if push_tag == u'推 ':
            g += 1
        elif push_tag == u'噓 ':
            b += 1
        else:
            n += 1
    messageNum = {"g": g, "b": b, "n": n, "all": num}
    d = {"a_ID": g_id,
         "b_作者": author.encode('utf-8'),
         "c_標題": title.encode('utf-8'),
         "d_日期": date.encode('utf-8'),
         "e_ip": ip.encode('utf-8'),
         "f_網址": link.encode('utf-8'),
         "g_內文": main_content.encode('utf-8'),
         "h_推文": message,
         "i_推文總數": messageNum}
    # A trailing comma keeps entries appendable; it is trimmed in __main__ before json.load.
    json_data = json.dumps(d, ensure_ascii=False, indent=4, sort_keys=True) + ','
    store(json_data)
def store(data):
    with open(FILENAME, 'a') as f:
        f.write(data)

def remove(value, deletechars):
    # Drop every character in deletechars, then trailing whitespace.
    for c in deletechars:
        value = value.replace(c, '')
    return value.rstrip()

def getPageNumber(content):
    # Extract the digits between 'index' and '.html' from an index-page href.
    startIndex = content.find('index')
    endIndex = content.find('.html')
    pageNumber = content[startIndex + 5:endIndex]
    return pageNumber
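# Example (made-up href for illustration):
#   getPageNumber('/bbs/forsale/index5432.html') -> '5432'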
def line_notify(item):
    url = "https://notify-api.line.me/api/notify"
    token = "KXwzqEGtIp1JEkS5GjqXqRAT0D4BdQQvCNcqOa7ySfz"  # personal LINE Notify access token
    headers = {"Authorization": "Bearer " + token}
    message = item[u'b_作者'] + "\n" + item[u'c_標題'] + "\n" + item[u'd_日期'] + "\n" \
              + item[u'f_網址'] + "\n" + item[u'g_內文'] + "\n"
    payload = {"message": message}
    files = {"imageFile": open("/home/shihyu/.mybin/line_ptt/test.jpg", "rb")}
    r = requests.post(url, headers=headers, params=payload, files=files)
if __name__ == "__main__":
    PttName = str(sys.argv[1])
    ParsingPage = int(sys.argv[2])
    # Undo shell escaping, then decode to unicode for keyword comparison.
    keywords = (sys.argv[3]).decode('string_escape').decode('utf-8')
    FILENAME = '/tmp/data-' + PttName + '-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.json'
    store('[')
    print 'Start parsing [', PttName, ']....'
    crawler(PttName, ParsingPage)
    store(']')
    # The file now ends with ',]'; rewrite it to close the JSON array cleanly.
    with open(FILENAME, 'r') as f:
        p = f.read()
    with open(FILENAME, 'w') as f:
        f.write(p[:-2] + ']')
    with open(FILENAME, 'r') as f:
        json_data = json.load(f)
    keywords = keywords.split(",")
    print keywords
    # Notify on any article whose title or body contains a keyword. Titles and
    # bodies are lowercased but keywords are not, so English keywords must be
    # given in lowercase.
    for item in json_data:
        for k in keywords:
            if k in item[u'c_標題'].lower() or k in item[u'g_內文'].lower():
                line_notify(item)
                print item[u'b_作者']
                print item[u'c_標題']
                print item[u'd_日期']
                print item[u'f_網址']
                print item[u'g_內文']
                print '\n'
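Before wiring the script into cron, it is worth confirming the LINE Notify token on its own. Below is a minimal smoke-test sketch (not part of the original script) that reuses the same endpoint and Bearer header as line_notify above; YOUR_TOKEN is a placeholder.

# line_token_test.py -- minimal LINE Notify smoke test; only needs `requests`
import requests

token = "YOUR_TOKEN"  # placeholder; substitute a real LINE Notify token
r = requests.post("https://notify-api.line.me/api/notify",
                  headers={"Authorization": "Bearer " + token},
                  data={"message": "ptt crawler test"})
print(r.status_code)  # 200 means the token is valid and the message went out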
python pttcrawler_python2.py <board name> <pages to crawl> <keywords>  (English keywords must be lowercase; separate multiple keywords with a comma)

python pttcrawler_python2.py forsale 3 "華納,hunter"
Crontab examples:

*/1 * * * * /usr/bin/python /home/shihyu/line_ptt/pttcrawler_python2.py forsale 10 hunter 2>&1 | tee /tmp/cronrun.txt
*/20 * * * * /usr/bin/python /home/shihyu/line_ptt/pttcrawler_python2.py forsale 3 華納 2>&1 | tee /tmp/cronrun.txt
*/20 * * * * sleep 60; /usr/bin/python /home/shihyu/line_ptt/pttcrawler_python2.py hypermall 3 Timberland 2>&1 | tee /tmp/cronrun.txt
*/20 * * * * sleep 60; /usr/bin/python /home/shihyu/line_ptt/pttcrawler_python2.py hypermall 3 hunter 2>&1 | tee /tmp/cronrun.txt
*/50 * * * * sleep 60; /usr/bin/python /home/shihyu/line_ptt/pttcrawler_python2.py Ame_Casual 3 danner 2>&1 | tee /tmp/cronrun.txt
*/50 * * * * sleep 60; /usr/bin/python /home/shihyu/line_ptt/pttcrawler_python2.py outdoorgear 3 danner 2>&1 | tee /tmp/cronrun.txt
*/50 * * * * sleep 60; /usr/bin/python /home/shihyu/line_ptt/pttcrawler_python2.py outdoorgear 3 arc 2>&1 | tee /tmp/cronrun.txt
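Note that every entry tees to the same /tmp/cronrun.txt, so each run overwrites the previous log; the crawled data itself lands in a fresh timestamped /tmp/data-<board>-<timestamp>.json each run.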