Skip to content

Commit

Permalink
Explore english LSM scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
fraboniface committed Oct 4, 2024
1 parent f0707af commit 6447a49
Show file tree
Hide file tree
Showing 2 changed files with 228 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,4 @@ Thumbs.db
notebooks/.ipynb_checkpoints
.env
.env
.venv
227 changes: 227 additions & 0 deletions climateguard/scrap_LSM.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from urllib.parse import urljoin\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from tqdm.autonotebook import tqdm\n",
"\n",
"from climateguard.news_scrapper import NewsScraper"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"ns = NewsScraper()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Entry point: the society/environment listing on eng.lsm.lv.\n",
"BASE_URL = \"https://eng.lsm.lv\"\n",
"subpath = \"/society/environment/\"\n",
"\n",
"# Reuse the headers carried by the NewsScraper instance. The timeout stops the cell\n",
"# from hanging on a stalled connection (requests waits indefinitely by default), and\n",
"# raise_for_status() fails loudly instead of letting an HTTP error page be parsed below.\n",
"response = requests.get(BASE_URL + subpath, headers=ns.headers, timeout=30)\n",
"response.raise_for_status()\n",
"soup = BeautifulSoup(response.content, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"116"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"articles = soup.body.find_all('article')\n",
"len(articles)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Build one absolute URL per <article> element; elements without an <a href> are skipped.\n",
"links = []\n",
"for art in articles:\n",
"    a = art.find('a')\n",
"    if a:\n",
"        url = a.attrs.get('href')\n",
"        if url:\n",
"            # urljoin gives the same result as concatenation for root-relative hrefs,\n",
"            # but also copes with hrefs that are already absolute or lack a leading\n",
"            # slash, which BASE_URL + url would turn into an invalid address.\n",
"            links.append(urljoin(BASE_URL, url))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Attention : cette partie a été faite avec l'ancien code de scraping d'Amine et doit sûrement être adaptée si tu veux l'utiliser."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/115 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 115/115 [02:13<00:00, 1.16s/it]\n"
]
}
],
"source": [
"# Run the scraper on every collected link, in order; tqdm reports progress.\n",
"scraped = [ns.scrape_article(link) for link in tqdm(links)]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(scraped)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'03.10.2024'"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.url[0].split('/')[-2].split('-')[0]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"dates = df['url'].apply(lambda s: s.split('/')[-2].split('-')[0])"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"df['date'] = pd.to_datetime(dates, format='%d.%m.%Y')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Character count of each article body. The vectorised .str accessor yields NaN for a\n",
"# missing text instead of raising TypeError the way apply(len) does, so one failed\n",
"# scrape shows up as a lower count in describe() rather than aborting the cell.\n",
"df['num_chars'] = df['full_text'].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 115.000000\n",
"mean 3024.626087\n",
"std 3020.180439\n",
"min 707.000000\n",
"25% 1454.000000\n",
"50% 2119.000000\n",
"75% 3344.000000\n",
"max 21682.000000\n",
"Name: num_chars, dtype: float64"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.num_chars.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 6447a49

Please sign in to comment.