"""Crawl seed news sites and collect the outbound links on each front page.

Ported from Python 2 (urllib2, print statement) to Python 3, with the
URL-joining bug in normalize() fixed via urllib.parse.urljoin.
"""

import sys
from urllib.parse import urljoin
from urllib.request import urlopen


class Seed:
    """Supplies the list of seed URLs to crawl."""

    def get(self):
        """Return the seed front-page URLs."""
        return [
            'http://www.appledaily.com.tw/',
            'http://www.libertytimes.com.tw/',
        ]


class Crawler:
    """Fetches each seed page and extracts normalized anchor links."""

    def crawl(self):
        """Crawl every seed URL.

        Returns:
            dict mapping each seed URL to the list of links found on it.
        """
        return {url: self.crawl_url(url) for url in Seed().get()}

    def crawl_url(self, url):
        """Fetch *url* and return every normalized <a href> on the page.

        Raises:
            urllib.error.URLError: if the page cannot be fetched.
        """
        # Imported lazily so the URL utilities (normalize) stay usable
        # even when bs4 is not installed.
        from bs4 import BeautifulSoup

        # Context manager closes the socket even if parsing fails
        # (the original leaked the response object).
        with urlopen(url) as response:
            html_doc = response.read()
        # Explicit parser: without it bs4 warns and results vary by
        # whichever parser happens to be installed.
        soup = BeautifulSoup(html_doc, 'html.parser')
        links = []
        for anchor in soup.find_all('a'):
            normalized = self.normalize(url, anchor.get('href'))
            if normalized:
                links.append(normalized)
        return links

    def normalize(self, url, relative_link):
        """Return an absolute URL for *relative_link*, or None.

        Keeps already-absolute http(s) links as-is and resolves
        root-relative links ('/path') against *url*. Empty hrefs and
        other schemes (mailto:, javascript:, page-relative paths) yield
        None, matching the original filtering behavior.
        """
        if not relative_link:
            return None
        if relative_link.startswith('http'):
            return relative_link
        if relative_link.startswith('/'):
            # urljoin resolves correctly even when the base URL has a
            # path component; the old `url + relative_link[1:]` produced
            # broken URLs in that case and mangled '//host/...' links.
            return urljoin(url, relative_link)
        return None


def main():
    """Crawl all seeds and print each site's links."""
    crawler = Crawler()
    results = crawler.crawl()
    for url, links in results.items():
        print('Site =>', url, '\n'.join(links), '\n')


if __name__ == '__main__':
    sys.exit(main())
2014年4月3日 星期四
使用 BeautifulSoup 爬新聞連結
雖然之前寫 Python3 寫爽爽,但 Python2 也能無縫接軌。
訂閱:
張貼留言 (Atom)
沒有留言:
張貼留言