Explore English LSM scraping

dataforgoodfr · Oct 4, 2024 · 6447a49 · 6447a49
1 parent f0707af
commit 6447a49
Show file tree

Hide file tree

Showing 2 changed files with 228 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -56,3 +56,4 @@ Thumbs.db
 notebooks/.ipynb_checkpoints
 .env
 .env
+.venv
diff --git a/climateguard/scrap_LSM.ipynb b/climateguard/scrap_LSM.ipynb
@@ -0,0 +1,227 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from urllib.parse import urljoin\n",
+    "\n",
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "from tqdm.autonotebook import tqdm\n",
+    "\n",
+    "from climateguard.news_scrapper import NewsScraper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ns = NewsScraper()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download and parse the listing page of the site's environment section.\n",
+    "BASE_URL = \"https://eng.lsm.lv\"\n",
+    "subpath = \"/society/environment/\"\n",
+    "# Time out instead of hanging forever on a stalled connection, and fail loudly\n",
+    "# on HTTP errors so an error page is never parsed as if it were the listing.\n",
+    "response = requests.get(BASE_URL + subpath, headers=ns.headers, timeout=30)\n",
+    "response.raise_for_status()\n",
+    "soup = BeautifulSoup(response.content, 'html.parser')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "116"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "articles = soup.body.find_all('article')\n",
+    "len(articles)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Collect one absolute URL per <article>: the href of its first <a>, if any.\n",
+    "links = []\n",
+    "for art in articles:\n",
+    "    a = art.find('a')\n",
+    "    href = a.attrs.get('href') if a else None\n",
+    "    if href:\n",
+    "        # urljoin resolves relative hrefs against BASE_URL and leaves already\n",
+    "        # absolute ones intact; plain concatenation would corrupt the latter.\n",
+    "        links.append(urljoin(BASE_URL, href))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Attention : cette partie a été faite avec l'ancien code de scraping d'Amine et doit sûrement être adaptée si tu veux l'utiliser."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|          | 0/115 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 115/115 [02:13<00:00,  1.16s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run the project scraper on each collected link, with a tqdm progress bar.\n",
+    "scraped = []\n",
+    "for link in tqdm(links):\n",
+    "    article = ns.scrape_article(link)\n",
+    "    scraped.append(article)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(scraped)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'03.10.2024'"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.url[0].split('/')[-2].split('-')[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dates = df['url'].apply(lambda s: s.split('/')[-2].split('-')[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['date'] = pd.to_datetime(dates, format='%d.%m.%Y')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Length of each article text in characters. The vectorised .str.len() also\n",
+    "# tolerates a missing text (NaN result) where apply(len) would raise.\n",
+    "df['num_chars'] = df['full_text'].str.len()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "count      115.000000\n",
+       "mean      3024.626087\n",
+       "std       3020.180439\n",
+       "min        707.000000\n",
+       "25%       1454.000000\n",
+       "50%       2119.000000\n",
+       "75%       3344.000000\n",
+       "max      21682.000000\n",
+       "Name: num_chars, dtype: float64"
+      ]
+     },
+     "execution_count": 74,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.num_chars.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -56,3 +56,4 @@ Thumbs.db @@
     notebooks/.ipynb_checkpoints
     .env
     .env
+    .venv