Merge pull request #1 from dataforgoodfr/feat/add_scrapper_and_package

feat: add news scrapper and packaging

SaboniAmine authored Oct 4, 2024
2 parents 4cc8a36 + 6447a49 commit c2d70fc
Showing 9 changed files with 1,895 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
@@ -56,3 +56,4 @@ Thumbs.db
notebooks/.ipynb_checkpoints
.env
.env
.venv
20 changes: 19 additions & 1 deletion README.md
@@ -1 +1,19 @@
# climateguard
# climateguard

# Install the repo

First install the scraper's dependencies by following the tutorial in the newspaper repo: https://github.com/codelucas/newspaper?tab=readme-ov-file#get-it-now
Only install the OS-level dependencies there; the Python package itself is installed in a virtual environment, as described below.

First, install the uv package:

```bash
pip install uv
```

Then create the virtual environment and sync the project dependencies:

```bash
uv venv
uv sync
```
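
Once the environment is synced, the scraper can be used directly from Python. A minimal sketch, assuming the `climateguard.news_scrapper` module is importable in your environment (as it is in the notebook further down):

```python
# Minimal usage sketch: scrape a single article and inspect the result.
# Assumes the repository's virtual environment is active and the import below resolves.
from climateguard.news_scrapper import NewsScraper

scraper = NewsScraper()
article = scraper.scrape_article(
    "https://www.lsm.lv/raksts/dzive--stils/vide-un-dzivnieki/03.10.2024-zinojums-lidz-gadsimta-beigam-latvija-prognozeta-krasta-linijas-atkapsanas-par-47-72-metriem.a571093/"
)
print(article.title, article.date)
```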
23 changes: 23 additions & 0 deletions climateguard/models.py
@@ -0,0 +1,23 @@

from pydantic import BaseModel


class Claim(BaseModel):
    claim: str
    context: str
    analysis: str
    disinformation_score: str
    disinformation_category: str


class Claims(BaseModel):
    claims: list[Claim]


class Article(BaseModel):
    title: str
    content: str
    url: str
    date: str
    topic: str
    source: str

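For reference, these models serialize straight to the JSON produced by the scraper below. A minimal sketch with hypothetical field values, assuming the module is importable as `climateguard.models`:

```python
import json

from climateguard.models import Article

# Field values are hypothetical; they only illustrate the shape of the model.
article = Article(
    title="Example headline",
    content="Example body text.",
    url="https://example.com/article",
    date="2024-10-03",
    topic="environment",
    source="https://example.com/article",
)
print(json.dumps(article.dict(), ensure_ascii=False, indent=4))
```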
116 changes: 116 additions & 0 deletions climateguard/news_scrapper.py
@@ -0,0 +1,116 @@
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import json
from models import Article
from newspaper import Article as NewspaperArticle
from urllib.parse import urlparse


class NewsScraper:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def scrape_article(self, url):
        # Try NewspaperArticle first
        newspaper_article = NewspaperArticle(url)
        newspaper_article.download()
        newspaper_article.parse()

        if newspaper_article.text:
            return Article(
                title=newspaper_article.title,
                content=newspaper_article.text,
                url=url,
                date=str(newspaper_article.publish_date) if newspaper_article.publish_date else '',
                topic='',  # NewspaperArticle doesn't provide a topic
                source=url
            )

        # If NewspaperArticle fails to extract text, use custom scrapers
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        if 'lsm.lv' in url:
            return self._scrape_lsm(soup, url)
        elif 'delfi.lv' in url:
            return self._scrape_delfi(soup, url)
        elif 'nra.lv' in url:
            return self._scrape_nra(soup, url)
        else:
            raise ValueError("Unsupported website")

    def _scrape_lsm(self, soup, url):
        content = ' '.join([p.text for p in soup.find_all('p')])
        title = soup.find('h1').text.strip() if soup.find('h1') else ''
        topic = soup.find('meta', {'property': 'article:section'})['content'] if soup.find('meta', {'property': 'article:section'}) else ''
        date = soup.find('meta', {'property': 'article:published_time'})['content'] if soup.find('meta', {'property': 'article:published_time'}) else ''

        return Article(
            title=title,
            content=content,
            url=url,
            date=date,
            topic=topic,
            source=url
        )

    def _scrape_delfi(self, soup, url):
        content = ' '.join([p.text for p in soup.find_all('p', class_='C-article-body__paragraph')])
        title = soup.find('h1', class_='C-article-headline').text.strip() if soup.find('h1', class_='C-article-headline') else ''
        topic = soup.find('a', class_='C-article-info__category').text.strip() if soup.find('a', class_='C-article-info__category') else ''
        date = soup.find('time', class_='C-article-info__time')['datetime'] if soup.find('time', class_='C-article-info__time') else ''

        return Article(
            title=title,
            content=content,
            url=url,
            date=date,
            topic=topic,
            source=url
        )

    def _scrape_nra(self, soup, url):
        content = ' '.join([p.text for p in soup.find_all('p', class_='article-text')])
        title = soup.find('h1', class_='article-title').text.strip() if soup.find('h1', class_='article-title') else ''
        topic = soup.find('span', class_='article-category').text.strip() if soup.find('span', class_='article-category') else ''
        date = soup.find('time', class_='article-date')['datetime'] if soup.find('time', class_='article-date') else ''

        return Article(
            title=title,
            content=content,
            url=url,
            date=date,
            topic=topic,
            source=url
        )


# Usage example:
if __name__ == "__main__":
    scraper = NewsScraper()
    urls = [
        "https://www.lsm.lv/raksts/dzive--stils/vide-un-dzivnieki/03.10.2024-zinojums-lidz-gadsimta-beigam-latvija-prognozeta-krasta-linijas-atkapsanas-par-47-72-metriem.a571093/",
        "https://www.delfi.lv/bizness/56234200/eiropas-zinas/120042670/zinam-problemu-un-neizmantojam-risinajumus-ko-latvijas-iedzivotaji-doma-par-klimata-parmainam",
        "https://www.delfi.lv/bizness/56234200/eiropas-zinas/120042670/kutri-izmantojam-dzerama-udens-kranus-kapec-iedzivotajiem-trukst-pamudinajuma-dzivot-zalak",
        "https://nra.lv/pasaule/465572-sliktas-zinas-baltvina-cienitajiem.htm",
        "https://www.lsm.lv/raksts/dzive--stils/vide-un-dzivnieki/20.09.2024-par-zalaku-rigu-spriedis-piecas-sestdienas-ko-sagaida-no-pirmas-iedzivotaju-klimata-asamblejas.a569637/"
    ]

    articles = []

    for url in urls:
        article = scraper.scrape_article(url)
        articles.append(article)
        print(f"Scraped: {article.title}")
        print(f"Content length: {len(article.content)}")
        print(f"Date: {article.date}")
        print("---")

    # Save to JSON
    output_file = 'scraped_articles.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump([article.dict() for article in articles], f, ensure_ascii=False, indent=4)

    print(f"\nArticles saved to {output_file}")
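`scrape_article` raises `ValueError` for domains without a dedicated parser, and the `requests` fallback can raise network errors, so a batch run like the one above stops at the first failure. A minimal defensive wrapper, sketched under those assumptions (the helper name is hypothetical):

```python
import requests

def try_scrape(scraper, url):
    """Scrape one URL, returning None instead of raising on known failure modes."""
    try:
        return scraper.scrape_article(url)
    except ValueError as exc:
        # Raised by scrape_article for domains without a dedicated _scrape_* method.
        print(f"Skipping unsupported site {url}: {exc}")
    except requests.RequestException as exc:
        # Network/HTTP errors from the fallback requests.get() call.
        print(f"Request failed for {url}: {exc}")
    return None
```

With a wrapper like this, the loop over `urls` keeps going when a single article fails; errors raised inside the `newspaper` download/parse path would still need their own handling.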
227 changes: 227 additions & 0 deletions climateguard/scrap_LSM.ipynb
@@ -0,0 +1,227 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from climateguard.news_scrapper import NewsScraper\n",
"from bs4 import BeautifulSoup\n",
"import requests\n",
"from tqdm.autonotebook import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"ns = NewsScraper()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"BASE_URL = \"https://eng.lsm.lv\"\n",
"subpath = \"/society/environment/\"\n",
"response = requests.get(BASE_URL+subpath, headers=ns.headers)\n",
"soup = BeautifulSoup(response.content, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"116"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"articles = soup.body.find_all('article')\n",
"len(articles)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"links = []\n",
"for art in articles:\n",
" a = art.find('a')\n",
" if a:\n",
" url = a.attrs.get('href')\n",
" if url:\n",
" links.append(BASE_URL + url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Attention : cette partie a été faite avec l'ancien code de scraping d'Amine et doit sûrement être adaptée si tu veux l'utiliser."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/115 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 115/115 [02:13<00:00, 1.16s/it]\n"
]
}
],
"source": [
"scraped = []\n",
"for l in tqdm(links):\n",
" scraped.append(ns.scrape_article(l))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(scraped)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'03.10.2024'"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.url[0].split('/')[-2].split('-')[0]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"dates = df['url'].apply(lambda s: s.split('/')[-2].split('-')[0])"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"df['date'] = pd.to_datetime(dates, format='%d.%m.%Y')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"df['num_chars'] = df.full_text.apply(len)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 115.000000\n",
"mean 3024.626087\n",
"std 3020.180439\n",
"min 707.000000\n",
"25% 1454.000000\n",
"50% 2119.000000\n",
"75% 3344.000000\n",
"max 21682.000000\n",
"Name: num_chars, dtype: float64"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.num_chars.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}