Crawling PTT using Selenium
python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Python 3 version: load the Gamesale board index in a headless browser and
# print every article title on the page.
# NOTE(review): webdriver.PhantomJS() was removed in Selenium 4 — confirm the
# installed selenium version still supports it.
driver = webdriver.PhantomJS()
driver.get("https://www.ptt.cc/bbs/Gamesale/index.html")

page = BeautifulSoup(driver.page_source, "lxml")
for anchor in page.select('.r-list-container .r-ent .title a'):
    print(str(anchor.string))
python2
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Python 2 variant of the snippet above: identical crawl, but prints the raw
# .string value instead of wrapping it in str().
driver = webdriver.PhantomJS()
driver.get("https://www.ptt.cc/bbs/Gamesale/index.html")

listing = BeautifulSoup(driver.page_source, "lxml")
for anchor in listing.select('.r-list-container .r-ent .title a'):
    print(anchor.string)
ptt 18禁
必須點選「我同意…」的按鈕後,才可進入該版及閱讀文章。於是大鳥絲瓜決定趁機玩玩看,怎麼讓 Python 點擊網頁中的按鈕。
#html
<!-- PTT's over-18 confirmation form as served at /ask/over18. Submitting the
     "yes" button POSTs name=yes (with the hidden "from" field carrying the
     board URL to return to); "no" leaves the site. -->
<form action="/ask/over18" method="post">
<input type="hidden" name="from" value="/bbs/gossiping/index.html">
<button class="btn-big" type="submit" name="yes" value="yes">我同意,我已年滿十八歲<br><small>進入</small></button>
<button class="btn-big" type="submit" name="no" value="no">未滿十八歲或不同意本條款<br><small>離開</small></button>
</form>
安裝 geckodriver
https://github.com/mozilla/geckodriver/releases
$ sudo mv ./geckodriver /usr/local/bin/
$ sudo chmod a+x /usr/local/bin/geckodriver
執行 geckodriver 查看是否能正常運行。
$ geckodriver
1476443497207 geckodriver INFO Listening on 127.0.0.1:4444
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Open the (18+) Gossiping board with Firefox/geckodriver, click through the
# age-confirmation page, then dump the resulting page's text.
driver = webdriver.Firefox()
driver.get("https://www.ptt.cc/bbs/Gossiping/index.html")

# The over-18 page has two .btn-big buttons ("yes" appears first in the form),
# so clicking the first match submits the consent.
button = driver.find_element_by_class_name('btn-big')
button.click()  # click the first matched btn-big

# Fix: pass an explicit parser ("lxml", as every other snippet here does)
# instead of letting BeautifulSoup guess one, and use the print() function so
# this snippet also runs under Python 3 (the original used a Python 2
# print statement).
soup = BeautifulSoup(driver.page_source, "lxml")
print(soup.text)
driver.quit()
Selenium 如何告訴 PTT 我已年滿 18 歲
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Visit one Gossiping index page; if the over-18 consent page is shown,
# press every "yes" button (normally exactly one), then re-request the page
# and print each article title on it.
driver = webdriver.PhantomJS()

target = "https://www.ptt.cc/bbs/Gossiping/index25664.html"
driver.get(target)

for consent in driver.find_elements_by_css_selector("div button[value='yes']"):
    consent.click()

driver.get(target)
soup = BeautifulSoup(driver.page_source, "lxml")
for anchor in soup.select('.r-list-container .r-ent .title a'):
    print(anchor.string)
TODO
# -*- coding: utf-8 -*-
import sys
from selenium import webdriver
from bs4 import BeautifulSoup

# Module-wide headless browser. Accept PTT's over-18 prompt once, up front,
# so the crawler functions below can fetch 18+ boards freely.
driver = webdriver.PhantomJS()
driver.get("https://www.ptt.cc/ask/over18")
for consent in driver.find_elements_by_css_selector("div button[value='yes']"):
    consent.click()
def getPageNumber(content):
    """Extract the numeric page index from a PTT index URL.

    e.g. '/bbs/Gossiping/index25664.html' -> '25664' (returned as a string).
    """
    start = content.find('index') + 5  # skip past the literal 'index'
    end = content.find('.html')
    return content[start:end]
def PageCount(PttName):
    """Return the newest index-page number of the board *PttName*.

    Loads the board's index.html and reads the href of the second '.btn.wide'
    anchor — presumably the "previous page" link, whose URL carries a page
    number one less than the newest page, hence the +1 (TODO confirm against
    the live markup).
    """
    driver.get('https://www.ptt.cc/bbs/' + PttName + '/index.html')
    parsed = BeautifulSoup(driver.page_source, "lxml")
    prev_href = parsed.select('.btn.wide')[1]['href']
    return int(getPageNumber(prev_href)) + 1
def crawler(PttName, ParsingPage):
    """Print article titles from the newest *ParsingPage* index pages of *PttName*."""
    newest = PageCount(PttName)
    # Walk backwards from the newest page toward older ones.
    for page_no in range(newest, newest - int(ParsingPage), -1):
        driver.get('https://www.ptt.cc/bbs/' + PttName + '/index' + str(page_no) + '.html')
        listing = BeautifulSoup(driver.page_source, "lxml")
        for anchor in listing.select('.r-list-container .r-ent .title a'):
            print(anchor.string)
if __name__ == "__main__":
    # Usage: python crawler.py <board_name> <number_of_pages>
    PttName = str(sys.argv[1])
    ParsingPage = int(sys.argv[2])
    # Fix: the original used a Python 2 print statement here (a SyntaxError
    # under Python 3) even though the rest of the script uses print(...).
    # The comma form reproduces the same spaced output under Python 3.
    print('Start parsing [', PttName, ']....')
    crawler(PttName, ParsingPage)