2014年4月3日 星期四

使用 BeautifulSoup 爬新聞連結

雖然之前寫 Python 3 寫得很順手，這次改用 Python 2 也能無縫接軌。



from sys import exit
from urllib2 import urlopen
from urlparse import urljoin

from bs4 import BeautifulSoup

class Seed:
    """Provider of the start URLs that the crawler visits."""

    # Front pages of the news sites to crawl, in crawl order.
    _START_URLS = (
        'http://www.appledaily.com.tw/',
        'http://www.libertytimes.com.tw/',
    )

    def get(self):
        """Return a new list holding the seed URLs."""
        return list(self._START_URLS)

class Crawler():
    """Collect the absolute http(s) links found on each seed page."""

    def crawl(self):
        """Crawl every URL supplied by ``Seed().get()``.

        Returns:
            dict mapping each seed URL to the list of normalized links
            extracted from that page (see ``crawl_url``).
        """
        results = {}
        for url in Seed().get():
            results[url] = self.crawl_url(url)
        return results

    def crawl_url(self, url):
        """Fetch ``url`` and return the usable links of its ``<a>`` tags.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            list of absolute links, in document order; anchors whose
            ``href`` is missing or rejected by ``normalize`` are skipped.
        """
        response = urlopen(url)
        try:
            html_doc = response.read()
        finally:
            # Release the connection even if read() raises.
            response.close()
        # NOTE(review): no parser is named here, so which parser bs4 uses
        # presumably depends on what is installed -- confirm and consider
        # pinning one for reproducible results.
        soup = BeautifulSoup(html_doc)
        candidates = (
            self.normalize(url, anchor.get('href'))
            for anchor in soup.find_all('a')
        )
        return [link for link in candidates if link]

    def normalize(self, url, relative_link):
        """Turn an ``href`` value into an absolute http(s) URL.

        Args:
            url: Absolute URL of the page the link was found on.
            relative_link: Raw ``href`` value; may be ``None`` or empty.

        Returns:
            The absolute URL, or ``None`` when the link is empty or is
            neither an http(s) URL nor a ``/``-prefixed reference
            (e.g. ``javascript:``, ``mailto:``, ``#fragment``).
        """
        if not relative_link:
            return None
        # Require the full scheme prefix so that a relative path which
        # merely begins with the letters "http" is not passed through.
        if relative_link.startswith(('http://', 'https://')):
            return relative_link
        if relative_link.startswith('/'):
            # urljoin resolves root-relative ("/path") and protocol-relative
            # ("//host/path") references against the page URL, regardless of
            # whether the page URL has a trailing slash or a path of its own.
            return urljoin(url, relative_link)
        return None

def main():
    crawler = Crawler()
    results = crawler.crawl()
    for url in results:
        print 'Site =>', url, '\n'.join(results[url]), '\n'


if __name__ == '__main__':
    exit(main())

沒有留言:

張貼留言