diff --git a/README.md b/README.md index f06e4fd..cec0063 100644 --- a/README.md +++ b/README.md @@ -4,5 +4,6 @@ Apple Daily HK Crawler ## Usage ```bash -python crawl.py 20150101 20150101.json +python crawl.py 20150101 20150101.json ``` + diff --git a/crawl.py b/crawl.py index 939d79f..dc5e183 100644 --- a/crawl.py +++ b/crawl.py @@ -13,7 +13,7 @@ def fetch(item): root = etree.HTML(r.text) item['title'] = root.xpath("//table[@class=\"LinkTable\"]/tr/td/h1")[0].text.strip() item['image'] = root.xpath("//meta[@property=\"og:image\"]")[0].attrib['content'].strip() - item['text'] = ''.join([s.strip() for s in root.xpath("//div[@id=\"masterContent\"]")[0].itertext()]).strip() + item['text'] = re.sub('(please only add this icon at the end of the article)','',''.join([s.strip() for s in root.xpath("//div[@id=\"masterContent\"]")[0].itertext()]).strip()) except Exception as e: print("cannot parse %s" % (item['link'])) raise