Merge pull request #1 from dataforgoodfr/feat/add_scrapper_and_package
feat: add news scrapper and packaging
Showing 9 changed files with 1,895 additions and 1 deletion.
File: .gitignore
```diff
@@ -56,3 +56,4 @@ Thumbs.db
 notebooks/.ipynb_checkpoints
 .env
 .env
+.venv
```
File: README.md
@@ -1 +1,19 @@

# climateguard

# Install the repo

First install the scraper's OS-level dependencies by following the tutorial in the newspaper repo: https://github.com/codelucas/newspaper?tab=readme-ov-file#get-it-now
Install only the OS dependencies; the Python package itself is installed in a virtual environment, as described below.

First install the uv package:

```bash
pip install uv
```

Then create the virtual environment and install the repo:

```bash
uv venv
uv sync
```
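
A minimal sketch of how the environment might be used once the sync completes. It assumes uv's default `.venv` directory (consistent with the `.gitignore` entry above) and that the scraper module lives at `climateguard/news_scrapper.py`, matching the import used in the notebook below; neither path is stated explicitly in this PR.

```bash
# Activate the uv-created virtual environment (default location: .venv)
source .venv/bin/activate

# Run the scraper's built-in usage example, which scrapes a few
# Latvian news URLs and writes scraped_articles.json
python climateguard/news_scrapper.py
```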
File: models.py
@@ -0,0 +1,23 @@

```python
from pydantic import BaseModel


class Claim(BaseModel):
    claim: str
    context: str
    analysis: str
    disinformation_score: str
    disinformation_category: str


class Claims(BaseModel):
    claims: list[Claim]


class Article(BaseModel):
    title: str
    content: str
    url: str
    date: str
    topic: str
    source: str
```
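
For orientation, a hedged example of how these models compose; all field values below are invented for illustration, and the `.dict()` serialization mirrors what news_scrapper.py does when saving articles.

```python
import json

from models import Article, Claim, Claims

# Hypothetical field values, purely to illustrate the schema above.
article = Article(
    title="Example headline",
    content="Example article body ...",
    url="https://example.com/article",
    date="2024-10-03",
    topic="environment",
    source="https://example.com/article",
)

claims = Claims(
    claims=[
        Claim(
            claim="Example claim extracted from the article",
            context="Paragraph of the article containing the claim",
            analysis="Why the claim is or is not supported",
            disinformation_score="high",
            disinformation_category="climate",
        )
    ]
)

# .dict() is the same serialization style used in news_scrapper.py
print(json.dumps({"article": article.dict(), "claims": claims.dict()}, indent=2))
```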
File: news_scrapper.py
@@ -0,0 +1,116 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
import re | ||
from datetime import datetime | ||
import json | ||
from models import Article | ||
from newspaper import Article as NewspaperArticle | ||
from urllib.parse import urlparse | ||
|
||
class NewsScraper: | ||
def __init__(self): | ||
self.headers = { | ||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' | ||
} | ||
|
||
def scrape_article(self, url): | ||
# Try NewspaperArticle first | ||
newspaper_article = NewspaperArticle(url) | ||
newspaper_article.download() | ||
newspaper_article.parse() | ||
|
||
if newspaper_article.text: | ||
return Article( | ||
title=newspaper_article.title, | ||
content=newspaper_article.text, | ||
url=url, | ||
date=str(newspaper_article.publish_date) if newspaper_article.publish_date else '', | ||
topic='', # NewspaperArticle doesn't provide a topic | ||
source=url | ||
) | ||
|
||
# If NewspaperArticle fails to extract text, use custom scrapers | ||
response = requests.get(url, headers=self.headers) | ||
soup = BeautifulSoup(response.content, 'html.parser') | ||
|
||
if 'lsm.lv' in url: | ||
return self._scrape_lsm(soup, url) | ||
elif 'delfi.lv' in url: | ||
return self._scrape_delfi(soup, url) | ||
elif 'nra.lv' in url: | ||
return self._scrape_nra(soup, url) | ||
else: | ||
raise ValueError("Unsupported website") | ||
|
||
def _scrape_lsm(self, soup, url): | ||
content = ' '.join([p.text for p in soup.find_all('p')]) | ||
title = soup.find('h1').text.strip() if soup.find('h1') else '' | ||
topic = soup.find('meta', {'property': 'article:section'})['content'] if soup.find('meta', {'property': 'article:section'}) else '' | ||
date = soup.find('meta', {'property': 'article:published_time'})['content'] if soup.find('meta', {'property': 'article:published_time'}) else '' | ||
|
||
return Article( | ||
title=title, | ||
content=content, | ||
url=url, | ||
date=date, | ||
topic=topic, | ||
source=url | ||
) | ||
|
||
def _scrape_delfi(self, soup, url): | ||
content = ' '.join([p.text for p in soup.find_all('p', class_='C-article-body__paragraph')]) | ||
title = soup.find('h1', class_='C-article-headline').text.strip() if soup.find('h1', class_='C-article-headline') else '' | ||
topic = soup.find('a', class_='C-article-info__category').text.strip() if soup.find('a', class_='C-article-info__category') else '' | ||
date = soup.find('time', class_='C-article-info__time')['datetime'] if soup.find('time', class_='C-article-info__time') else '' | ||
|
||
return Article( | ||
title=title, | ||
content=content, | ||
url=url, | ||
date=date, | ||
topic=topic, | ||
source=url | ||
) | ||
|
||
def _scrape_nra(self, soup, url): | ||
content = ' '.join([p.text for p in soup.find_all('p', class_='article-text')]) | ||
title = soup.find('h1', class_='article-title').text.strip() if soup.find('h1', class_='article-title') else '' | ||
topic = soup.find('span', class_='article-category').text.strip() if soup.find('span', class_='article-category') else '' | ||
date = soup.find('time', class_='article-date')['datetime'] if soup.find('time', class_='article-date') else '' | ||
|
||
return Article( | ||
title=title, | ||
content=content, | ||
url=url, | ||
date=date, | ||
topic=topic, | ||
source=url | ||
) | ||
|
||
# Usage example: | ||
if __name__ == "__main__": | ||
scraper = NewsScraper() | ||
urls = [ | ||
"https://www.lsm.lv/raksts/dzive--stils/vide-un-dzivnieki/03.10.2024-zinojums-lidz-gadsimta-beigam-latvija-prognozeta-krasta-linijas-atkapsanas-par-47-72-metriem.a571093/", | ||
"https://www.delfi.lv/bizness/56234200/eiropas-zinas/120042670/zinam-problemu-un-neizmantojam-risinajumus-ko-latvijas-iedzivotaji-doma-par-klimata-parmainam", | ||
"https://www.delfi.lv/bizness/56234200/eiropas-zinas/120042670/kutri-izmantojam-dzerama-udens-kranus-kapec-iedzivotajiem-trukst-pamudinajuma-dzivot-zalak", | ||
"https://nra.lv/pasaule/465572-sliktas-zinas-baltvina-cienitajiem.htm", | ||
"https://www.lsm.lv/raksts/dzive--stils/vide-un-dzivnieki/20.09.2024-par-zalaku-rigu-spriedis-piecas-sestdienas-ko-sagaida-no-pirmas-iedzivotaju-klimata-asamblejas.a569637/" | ||
] | ||
|
||
articles = [] | ||
|
||
for url in urls: | ||
article = scraper.scrape_article(url) | ||
articles.append(article) | ||
print(f"Scraped: {article.title}") | ||
print(f"Content length: {len(article.content)}") | ||
print(f"Date: {article.date}") | ||
print("---") | ||
|
||
# Save to JSON | ||
output_file = 'scraped_articles.json' | ||
with open(output_file, 'w', encoding='utf-8') as f: | ||
json.dump([article.dict() for article in articles], f, ensure_ascii=False, indent=4) | ||
|
||
print(f"\nArticles saved to {output_file}") |
@@ -0,0 +1,227 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 28, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from climateguard.news_scrapper import NewsScraper\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"import requests\n", | ||
"from tqdm.autonotebook import tqdm" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ns = NewsScraper()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"BASE_URL = \"https://eng.lsm.lv\"\n", | ||
"subpath = \"/society/environment/\"\n", | ||
"response = requests.get(BASE_URL+subpath, headers=ns.headers)\n", | ||
"soup = BeautifulSoup(response.content, 'html.parser')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"116" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"articles = soup.body.find_all('article')\n", | ||
"len(articles)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 31, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"links = []\n", | ||
"for art in articles:\n", | ||
" a = art.find('a')\n", | ||
" if a:\n", | ||
" url = a.attrs.get('href')\n", | ||
" if url:\n", | ||
" links.append(BASE_URL + url)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Attention : cette partie a été faite avec l'ancien code de scraping d'Amine et doit sûrement être adaptée si tu veux l'utiliser." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 33, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
" 0%| | 0/115 [00:00<?, ?it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"100%|██████████| 115/115 [02:13<00:00, 1.16s/it]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"scraped = []\n", | ||
"for l in tqdm(links):\n", | ||
" scraped.append(ns.scrape_article(l))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 36, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 38, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df = pd.DataFrame(scraped)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 49, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'03.10.2024'" | ||
] | ||
}, | ||
"execution_count": 49, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df.url[0].split('/')[-2].split('-')[0]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 51, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"dates = df['url'].apply(lambda s: s.split('/')[-2].split('-')[0])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 54, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df['date'] = pd.to_datetime(dates, format='%d.%m.%Y')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 64, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df['num_chars'] = df.full_text.apply(len)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 74, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"count 115.000000\n", | ||
"mean 3024.626087\n", | ||
"std 3020.180439\n", | ||
"min 707.000000\n", | ||
"25% 1454.000000\n", | ||
"50% 2119.000000\n", | ||
"75% 3344.000000\n", | ||
"max 21682.000000\n", | ||
"Name: num_chars, dtype: float64" | ||
] | ||
}, | ||
"execution_count": 74, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df.num_chars.describe()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": ".venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
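
The listing-page crawl in the notebook could be distilled into one reusable function. Below is a hedged sketch under the same assumptions as the cells above (the climateguard.news_scrapper import path, the eng.lsm.lv listing markup, and dates embedded in article URLs); scrape_lsm_section is an invented name, not part of this PR.

```python
import pandas as pd
import requests
from bs4 import BeautifulSoup

from climateguard.news_scrapper import NewsScraper


def scrape_lsm_section(base_url="https://eng.lsm.lv", subpath="/society/environment/"):
    """Hypothetical helper: collect article links from an lsm.lv section page,
    scrape each one, and return the results as a DataFrame with parsed dates."""
    ns = NewsScraper()
    listing = requests.get(base_url + subpath, headers=ns.headers)
    soup = BeautifulSoup(listing.content, 'html.parser')

    # Gather article links exactly as the notebook does.
    links = []
    for art in soup.body.find_all('article'):
        a = art.find('a')
        if a and a.attrs.get('href'):
            links.append(base_url + a.attrs['href'])

    articles = [ns.scrape_article(link) for link in links]
    df = pd.DataFrame([a.dict() for a in articles])

    # lsm.lv URLs embed the publication date, e.g. ".../03.10.2024-....a571093/"
    df['date'] = pd.to_datetime(
        df['url'].apply(lambda s: s.split('/')[-2].split('-')[0]),
        format='%d.%m.%Y',
        errors='coerce',
    )
    return df
```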