Crawling PTT using Selenium
python3
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Python 3 version: load the Gamesale board index in a headless browser and
# print every article title on the page.
# NOTE(review): webdriver.PhantomJS() was removed in Selenium 4 — confirm the
# installed selenium version still supports it.
driver = webdriver.PhantomJS()
driver.get("https://www.ptt.cc/bbs/Gamesale/index.html")

page = BeautifulSoup(driver.page_source, "lxml")
for anchor in page.select('.r-list-container .r-ent .title a'):
    print(str(anchor.string))
python2
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Python 2 variant of the snippet above: identical crawl, but prints the raw
# .string value instead of wrapping it in str().
driver = webdriver.PhantomJS()
driver.get("https://www.ptt.cc/bbs/Gamesale/index.html")

listing = BeautifulSoup(driver.page_source, "lxml")
for anchor in listing.select('.r-list-container .r-ent .title a'):
    print(anchor.string)
ptt 18禁
必須點選「我同意…」的按鈕後,才可進入該版及閱讀文章。於是大鳥絲瓜決定趁機玩玩看,怎麼讓 Python 點擊網頁中的按鈕。
#html
<!-- PTT's over-18 confirmation form as served at /ask/over18. Submitting the
     "yes" button POSTs name=yes (with the hidden "from" field carrying the
     board URL to return to); "no" leaves the site. -->
<form action="/ask/over18" method="post">
<input type="hidden" name="from" value="/bbs/gossiping/index.html">
<button class="btn-big" type="submit" name="yes" value="yes">我同意,我已年滿十八歲<br><small>進入</small></button>
<button class="btn-big" type="submit" name="no" value="no">未滿十八歲或不同意本條款<br><small>離開</small></button>
</form>
安裝 geckodriver
https://github.com/mozilla/geckodriver/releases
$ sudo mv ./geckodriver /usr/local/bin/
$ sudo chmod a+x /usr/local/bin/geckodriver
執行 geckodriver 查看是否能正常運行。
$ geckodriver
1476443497207 geckodriver INFO Listening on 127.0.0.1:4444
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Open the (18+) Gossiping board with Firefox/geckodriver, click through the
# age-confirmation page, then dump the resulting page's text.
driver = webdriver.Firefox()
driver.get("https://www.ptt.cc/bbs/Gossiping/index.html")

# The over-18 page has two .btn-big buttons ("yes" appears first in the form),
# so clicking the first match submits the consent.
button = driver.find_element_by_class_name('btn-big')
button.click()  # click the first matched btn-big

# Fix: pass an explicit parser ("lxml", as every other snippet here does)
# instead of letting BeautifulSoup guess one, and use the print() function so
# this snippet also runs under Python 3 (the original used a Python 2
# print statement).
soup = BeautifulSoup(driver.page_source, "lxml")
print(soup.text)
driver.quit()
Selenium 如何告訴 PTT 我已年滿 18 歲
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup

# Visit one Gossiping index page; if the over-18 consent page is shown,
# press every "yes" button (normally exactly one), then re-request the page
# and print each article title on it.
driver = webdriver.PhantomJS()

target = "https://www.ptt.cc/bbs/Gossiping/index25664.html"
driver.get(target)

for consent in driver.find_elements_by_css_selector("div button[value='yes']"):
    consent.click()

driver.get(target)
soup = BeautifulSoup(driver.page_source, "lxml")
for anchor in soup.select('.r-list-container .r-ent .title a'):
    print(anchor.string)
TODO
# -*- coding: utf-8 -*-
import sys
from selenium import webdriver
from bs4 import BeautifulSoup

# Module-wide headless browser. Accept PTT's over-18 prompt once, up front,
# so the crawler functions below can fetch 18+ boards freely.
driver = webdriver.PhantomJS()
driver.get("https://www.ptt.cc/ask/over18")
for consent in driver.find_elements_by_css_selector("div button[value='yes']"):
    consent.click()
def getPageNumber(content):
    """Extract the numeric page index from a PTT index URL.

    e.g. '/bbs/Gossiping/index25664.html' -> '25664' (returned as a string).
    """
    start = content.find('index') + 5  # skip past the literal 'index'
    end = content.find('.html')
    return content[start:end]
def PageCount(PttName):
    """Return the newest index-page number of the board *PttName*.

    Loads the board's index.html and reads the href of the second '.btn.wide'
    anchor — presumably the "previous page" link, whose URL carries a page
    number one less than the newest page, hence the +1 (TODO confirm against
    the live markup).
    """
    driver.get('https://www.ptt.cc/bbs/' + PttName + '/index.html')
    parsed = BeautifulSoup(driver.page_source, "lxml")
    prev_href = parsed.select('.btn.wide')[1]['href']
    return int(getPageNumber(prev_href)) + 1
def crawler(PttName, ParsingPage):
    """Print article titles from the newest *ParsingPage* index pages of *PttName*."""
    newest = PageCount(PttName)
    # Walk backwards from the newest page toward older ones.
    for page_no in range(newest, newest - int(ParsingPage), -1):
        driver.get('https://www.ptt.cc/bbs/' + PttName + '/index' + str(page_no) + '.html')
        listing = BeautifulSoup(driver.page_source, "lxml")
        for anchor in listing.select('.r-list-container .r-ent .title a'):
            print(anchor.string)
if __name__ == "__main__":
    # Usage: python crawler.py <board_name> <number_of_pages>
    PttName = str(sys.argv[1])
    ParsingPage = int(sys.argv[2])
    # Fix: the original used a Python 2 print statement here (a SyntaxError
    # under Python 3) even though the rest of the script uses print(...).
    # The comma form reproduces the same spaced output under Python 3.
    print('Start parsing [', PttName, ']....')
    crawler(PttName, ParsingPage)