Merge pull request #1 from dataforgoodfr/feat/add_scrapper_and_package

feat: add news scrapper and packaging

SaboniAmine authored Oct 4, 2024
2 parents 4cc8a36 + 6447a49 commit c2d70fc
Showing 9 changed files with 1,895 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
@@ -56,3 +56,4 @@ Thumbs.db
notebooks/.ipynb_checkpoints
.env
.env
.venv
20 changes: 19 additions & 1 deletion README.md
@@ -1 +1,19 @@
# climateguard
# climateguard

# Install the repo

First install the scraper's dependencies by following the tutorial in the newspaper repo: https://github.com/codelucas/newspaper?tab=readme-ov-file#get-it-now
Only install the OS-level dependencies there; the Python package itself is installed in a virtual environment, as described below.

First, install the uv package:

```bash
pip install uv
```

Then create the virtual environment and sync the project dependencies:

```bash
uv venv
uv sync
```
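
Once the environment is synced, the scraper can be used directly from Python. A minimal sketch, assuming the `climateguard.news_scrapper` module is importable in your environment (as it is in the notebook further down):

```python
# Minimal usage sketch: scrape a single article and inspect the result.
# Assumes the repository's virtual environment is active and the import below resolves.
from climateguard.news_scrapper import NewsScraper

scraper = NewsScraper()
article = scraper.scrape_article(
    "https://www.lsm.lv/raksts/dzive--stils/vide-un-dzivnieki/03.10.2024-zinojums-lidz-gadsimta-beigam-latvija-prognozeta-krasta-linijas-atkapsanas-par-47-72-metriem.a571093/"
)
print(article.title, article.date)
```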
23 changes: 23 additions & 0 deletions climateguard/models.py
@@ -0,0 +1,23 @@

from pydantic import BaseModel


class Claim(BaseModel):
    claim: str
    context: str
    analysis: str
    disinformation_score: str
    disinformation_category: str


class Claims(BaseModel):
    claims: list[Claim]


class Article(BaseModel):
    title: str
    content: str
    url: str
    date: str
    topic: str
    source: str

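For reference, these models serialize straight to the JSON produced by the scraper below. A minimal sketch with hypothetical field values, assuming the module is importable as `climateguard.models`:

```python
import json

from climateguard.models import Article

# Field values are hypothetical; they only illustrate the shape of the model.
article = Article(
    title="Example headline",
    content="Example body text.",
    url="https://example.com/article",
    date="2024-10-03",
    topic="environment",
    source="https://example.com/article",
)
print(json.dumps(article.dict(), ensure_ascii=False, indent=4))
```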
116 changes: 116 additions & 0 deletions climateguard/news_scrapper.py
@@ -0,0 +1,116 @@
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import json
from models import Article
from newspaper import Article as NewspaperArticle
from urllib.parse import urlparse


class NewsScraper:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def scrape_article(self, url):
        # Try NewspaperArticle first
        newspaper_article = NewspaperArticle(url)
        newspaper_article.download()
        newspaper_article.parse()

        if newspaper_article.text:
            return Article(
                title=newspaper_article.title,
                content=newspaper_article.text,
                url=url,
                date=str(newspaper_article.publish_date) if newspaper_article.publish_date else '',
                topic='',  # NewspaperArticle doesn't provide a topic
                source=url
            )

        # If NewspaperArticle fails to extract text, use custom scrapers
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        if 'lsm.lv' in url:
            return self._scrape_lsm(soup, url)
        elif 'delfi.lv' in url:
            return self._scrape_delfi(soup, url)
        elif 'nra.lv' in url:
            return self._scrape_nra(soup, url)
        else:
            raise ValueError("Unsupported website")

    def _scrape_lsm(self, soup, url):
        content = ' '.join([p.text for p in soup.find_all('p')])
        title = soup.find('h1').text.strip() if soup.find('h1') else ''
        topic = soup.find('meta', {'property': 'article:section'})['content'] if soup.find('meta', {'property': 'article:section'}) else ''
        date = soup.find('meta', {'property': 'article:published_time'})['content'] if soup.find('meta', {'property': 'article:published_time'}) else ''

        return Article(
            title=title,
            content=content,
            url=url,
            date=date,
            topic=topic,
            source=url
        )

    def _scrape_delfi(self, soup, url):
        content = ' '.join([p.text for p in soup.find_all('p', class_='C-article-body__paragraph')])
        title = soup.find('h1', class_='C-article-headline').text.strip() if soup.find('h1', class_='C-article-headline') else ''
        topic = soup.find('a', class_='C-article-info__category').text.strip() if soup.find('a', class_='C-article-info__category') else ''
        date = soup.find('time', class_='C-article-info__time')['datetime'] if soup.find('time', class_='C-article-info__time') else ''

        return Article(
            title=title,
            content=content,
            url=url,
            date=date,
            topic=topic,
            source=url
        )

    def _scrape_nra(self, soup, url):
        content = ' '.join([p.text for p in soup.find_all('p', class_='article-text')])
        title = soup.find('h1', class_='article-title').text.strip() if soup.find('h1', class_='article-title') else ''
        topic = soup.find('span', class_='article-category').text.strip() if soup.find('span', class_='article-category') else ''
        date = soup.find('time', class_='article-date')['datetime'] if soup.find('time', class_='article-date') else ''

        return Article(
            title=title,
            content=content,
            url=url,
            date=date,
            topic=topic,
            source=url
        )


# Usage example:
if __name__ == "__main__":
    scraper = NewsScraper()
    urls = [
        "https://www.lsm.lv/raksts/dzive--stils/vide-un-dzivnieki/03.10.2024-zinojums-lidz-gadsimta-beigam-latvija-prognozeta-krasta-linijas-atkapsanas-par-47-72-metriem.a571093/",
        "https://www.delfi.lv/bizness/56234200/eiropas-zinas/120042670/zinam-problemu-un-neizmantojam-risinajumus-ko-latvijas-iedzivotaji-doma-par-klimata-parmainam",
        "https://www.delfi.lv/bizness/56234200/eiropas-zinas/120042670/kutri-izmantojam-dzerama-udens-kranus-kapec-iedzivotajiem-trukst-pamudinajuma-dzivot-zalak",
        "https://nra.lv/pasaule/465572-sliktas-zinas-baltvina-cienitajiem.htm",
        "https://www.lsm.lv/raksts/dzive--stils/vide-un-dzivnieki/20.09.2024-par-zalaku-rigu-spriedis-piecas-sestdienas-ko-sagaida-no-pirmas-iedzivotaju-klimata-asamblejas.a569637/"
    ]

    articles = []

    for url in urls:
        article = scraper.scrape_article(url)
        articles.append(article)
        print(f"Scraped: {article.title}")
        print(f"Content length: {len(article.content)}")
        print(f"Date: {article.date}")
        print("---")

    # Save to JSON
    output_file = 'scraped_articles.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump([article.dict() for article in articles], f, ensure_ascii=False, indent=4)

    print(f"\nArticles saved to {output_file}")
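`scrape_article` raises `ValueError` for domains without a dedicated parser, and the `requests` fallback can raise network errors, so a batch run like the one above stops at the first failure. A minimal defensive wrapper, sketched under those assumptions (the helper name is hypothetical):

```python
import requests

def try_scrape(scraper, url):
    """Scrape one URL, returning None instead of raising on known failure modes."""
    try:
        return scraper.scrape_article(url)
    except ValueError as exc:
        # Raised by scrape_article for domains without a dedicated _scrape_* method.
        print(f"Skipping unsupported site {url}: {exc}")
    except requests.RequestException as exc:
        # Network/HTTP errors from the fallback requests.get() call.
        print(f"Request failed for {url}: {exc}")
    return None
```

With a wrapper like this, the loop over `urls` keeps going when a single article fails; errors raised inside the `newspaper` download/parse path would still need their own handling.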
227 changes: 227 additions & 0 deletions climateguard/scrap_LSM.ipynb
@@ -0,0 +1,227 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from climateguard.news_scrapper import NewsScraper\n",
"from bs4 import BeautifulSoup\n",
"import requests\n",
"from tqdm.autonotebook import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"ns = NewsScraper()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"BASE_URL = \"https://eng.lsm.lv\"\n",
"subpath = \"/society/environment/\"\n",
"response = requests.get(BASE_URL+subpath, headers=ns.headers)\n",
"soup = BeautifulSoup(response.content, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"116"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"articles = soup.body.find_all('article')\n",
"len(articles)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"links = []\n",
"for art in articles:\n",
" a = art.find('a')\n",
" if a:\n",
" url = a.attrs.get('href')\n",
" if url:\n",
" links.append(BASE_URL + url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Attention : cette partie a été faite avec l'ancien code de scraping d'Amine et doit sûrement être adaptée si tu veux l'utiliser."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/115 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 115/115 [02:13<00:00, 1.16s/it]\n"
]
}
],
"source": [
"scraped = []\n",
"for l in tqdm(links):\n",
" scraped.append(ns.scrape_article(l))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(scraped)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'03.10.2024'"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.url[0].split('/')[-2].split('-')[0]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"dates = df['url'].apply(lambda s: s.split('/')[-2].split('-')[0])"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"df['date'] = pd.to_datetime(dates, format='%d.%m.%Y')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"df['num_chars'] = df.full_text.apply(len)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 115.000000\n",
"mean 3024.626087\n",
"std 3020.180439\n",
"min 707.000000\n",
"25% 1454.000000\n",
"50% 2119.000000\n",
"75% 3344.000000\n",
"max 21682.000000\n",
"Name: num_chars, dtype: float64"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.num_chars.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}