refacto: parsing description #84 #86

Merged 1 commit on Nov 27, 2023
4 changes: 2 additions & 2 deletions docker-compose.yml
@@ -48,8 +48,8 @@ services:
     #entrypoint: ["sleep", "1200"] # use to debug the container if needed
     entrypoint: ["python", "quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py"]
     environment:
-      ENV: docker
-      LOGLEVEL: debug # change me to info (warning, error) to have less log
+      ENV: docker # change me to prod for real cases
+      LOGLEVEL: INFO # change me to debug for more logs, or to warning/error for fewer
       PYTHONPATH: /app
       POSTGRES_USER: user
       POSTGRES_DB: barometre
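For context, a minimal sketch of how an ingestion script can consume these two variables, assuming a plain `logging.basicConfig` setup (the actual wiring in `ingest_sitemap_in_db.py` may differ):

```python
import logging
import os

# read LOGLEVEL from the docker-compose environment, defaulting to INFO;
# getattr maps the string ("DEBUG", "INFO", ...) to the logging constant
level_name = os.environ.get("LOGLEVEL", "INFO").upper()
logging.basicConfig(level=getattr(logging, level_name, logging.INFO))

if os.environ.get("ENV") == "docker":
    logging.info("running against the dockerized test stack")
```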
23 changes: 16 additions & 7 deletions quotaclimat/data_ingestion/scrap_sitemap.py
@@ -10,7 +10,7 @@

from quotaclimat.data_ingestion.config_sitemap import (SITEMAP_CONFIG, SITEMAP_TEST_CONFIG, SITEMAP_DOCKER_CONFIG, MEDIA_CONFIG)
from postgres.schemas.models import get_sitemap_cols
-from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news
+from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, agent
import asyncio
import hashlib

@@ -180,8 +180,9 @@ async def query_one_sitemap_and_transform(media: str, sitemap_conf: Dict, df_from_pg
"""
try:
logging.info("\n\nParsing media %s with %s" % (media, sitemap_conf["sitemap_url"]))
logging.info(f"User-agent: { agent['User-Agent'] }")
#@see https://advertools.readthedocs.io/en/master/advertools.sitemaps.html#news-sitemaps

adv.sitemaps.headers['User-Agent'] = agent["User-Agent"]
temp_df = adv.sitemap_to_df(sitemap_conf["sitemap_url"])

temp_df.rename(columns={"loc": "url"}, inplace=True)
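The new `adv.sitemaps.headers` assignment is what makes advertools send the shared `agent` User-Agent on sitemap requests. A standalone sketch of the pattern; the URL and agent string below are placeholders, not the project's values:

```python
import advertools as adv

# placeholder User-Agent dict mirroring the `agent` imported above
agent = {"User-Agent": "Mozilla/5.0 (compatible; ExampleBot/1.0)"}

# advertools exposes a module-level headers dict used for sitemap requests
adv.sitemaps.headers["User-Agent"] = agent["User-Agent"]

# returns one row per sitemap entry, with columns such as "loc" and "lastmod"
df = adv.sitemap_to_df("https://www.example.com/sitemap_news.xml")
print(df.head())
```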
@@ -216,14 +217,22 @@
         #keep only unknown id to not parse every website for new_description
         difference_df = get_diff_from_df(df, df_from_pg)

-        # concurrency : https://stackoverflow.com/a/67944888/3535853
-        # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
-        difference_df['news_description'] = await asyncio.gather(*(add_news_meta(row["url"], media, row["news_title"]) for (_, row) in difference_df.iterrows()))
-
+        difference_df['news_description'] = await get_description_article(media, difference_df)

         return difference_df
     except Exception as err:
         logging.error(
             "Sitemap query error for %s: %s : %s"
             % (media, sitemap_conf["sitemap_url"], err)
         )
         return None

+# concurrency : https://stackoverflow.com/a/67944888/3535853
+# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
+async def get_description_article(media, article_df):
+    article_tasks = []
+    for (_, row) in article_df.iterrows():
+        description = add_news_meta(row["url"], media, row["news_title"])
+        article_tasks.append(description)
+
+    return await asyncio.gather(*article_tasks)
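The refactor relies on `asyncio.gather` returning results in the same order as the coroutines passed in, which is what lets the result list be assigned straight back as a DataFrame column. A self-contained sketch, with a dummy coroutine standing in for the real `add_news_meta`:

```python
import asyncio
import pandas as pd

async def fake_add_news_meta(url: str) -> str:
    # stand-in for the real add_news_meta(url, media, title) coroutine
    await asyncio.sleep(0)
    return f"description for {url}"

async def main():
    df = pd.DataFrame({"url": ["https://a.example", "https://b.example"]})
    # gather preserves input order, so results line up with df's rows
    df["news_description"] = await asyncio.gather(
        *(fake_add_news_meta(row["url"]) for _, row in df.iterrows())
    )
    print(df)

asyncio.run(main())
```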
30 changes: 23 additions & 7 deletions test/sitemap/test_scrap_html.py
@@ -1,14 +1,30 @@
 import logging
 import pytest
-import os
+import pandas as pd
 from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, get_hat_20minutes, get_url_content
+from quotaclimat.data_ingestion.scrap_sitemap import get_description_article
 from bs4 import BeautifulSoup
+from utils import get_localhost, debug_df

-localhost = ""
-if(os.environ.get("ENV") == "docker"):
-    localhost = "http://nginxtest:80"
-else:
-    localhost = "http://localhost:8000"
+localhost = get_localhost()

+@pytest.mark.asyncio
+async def test_get_description_article():
+    url_to_parse = f"{localhost}/mediapart_website.html"
+    media = "Le Figaro"
+    df_articles = pd.DataFrame([{
+        "url": url_to_parse,
+        "news_title": media,
+    }])
+
+    expected_result = pd.DataFrame([{
+        "url": url_to_parse,
+        "news_title": media,
+        "news_description": "description could be parsed with success"
+    }])
+
+    df_articles["news_description"] = await get_description_article(media, df_articles)
+    debug_df(df_articles)
+    pd.testing.assert_frame_equal(df_articles.reset_index(drop=True), expected_result.reset_index(drop=True))

@pytest.mark.asyncio
async def test_get_meta_news():
18 changes: 5 additions & 13 deletions test/sitemap/test_scrap_sitemap.py
@@ -1,20 +1,12 @@
 import logging

-import numpy as np
 import pandas as pd
 import pytest
-import os
 from quotaclimat.data_ingestion.scrap_sitemap import (filter_on_date, find_sections, get_consistent_hash, get_diff_from_df, query_one_sitemap_and_transform, get_sections_from_url, normalize_section)
 from quotaclimat.data_ingestion.config_sitemap import (SITEMAP_CONFIG)
 from datetime import datetime, timedelta

+from utils import get_localhost
 from quotaclimat.data_ingestion.ingest_db.ingest_sitemap_in_db import get_sitemap_list

-url_to_parse = ""
-if(os.environ.get("ENV") == "docker"):
-    url_to_parse = "http://nginxtest:80/"
-else:
-    url_to_parse = "http://localhost:8000/"
+url_to_parse = get_localhost()

def test_normalize_section():
assert normalize_section(["test", "pizza"]) == ["test", "pizza"]
@@ -32,7 +24,7 @@ def test_get_sitemap_list():
     sitemap = list(get_sitemap_list())[0]
     # locally we test only a few items
     sitemap_url = sitemap
-    sitemap_url == "http://nginxtest:80/sitemap_news_figaro_3.xml"
+    assert sitemap_url == f"{url_to_parse}/sitemap_news_figaro_3.xml"

@pytest.mark.asyncio
async def test_query_one_sitemap_and_transform():
@@ -48,7 +40,7 @@ async def test_query_one_sitemap_and_transform():
     output = await query_one_sitemap_and_transform(media, sitemap_config[media], pg_df)
     title = "EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi"
     expected_result = pd.DataFrame([{
-        "url": f"{url_to_parse}mediapart_website.html",
+        "url": f"{url_to_parse}/mediapart_website.html",
         "lastmod": pd.Timestamp("2023-10-12 15:34:28"),
         "publication_name": "Le Figaro",
         "publication_language": "fr",
@@ -96,7 +88,7 @@ async def test_query_one_sitemap_and_transform_hat_parsing():
title = "Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir"
publication_name = "Le Figaro"
expected_result = pd.DataFrame([{
"url" : f"{url_to_parse}20minutes_website.html",
"url" : f"{url_to_parse}/20minutes_website.html",
"lastmod" :pd.Timestamp("2023-10-12 15:34:21"),
"publication_name" :"Le Figaro",
"publication_language" :"fr",
14 changes: 14 additions & 0 deletions test/sitemap/utils.py
@@ -0,0 +1,14 @@
+import logging
+import os
+
+def get_localhost():
+    localhost = ""
+    if os.environ.get("ENV") == "docker":
+        localhost = "http://nginxtest:80"
+    else:
+        localhost = "http://localhost:8000"
+    return localhost
+
+def debug_df(df):
+    logging.warning("--------------------DEBUG DF-------------------")
+    logging.warning(df.head(1).to_string())
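For reference, a minimal sketch of how these two helpers are meant to be used from a sibling test module (the URL path is a placeholder mirroring the tests above):

```python
# hypothetical usage from a test module in test/sitemap/
import pandas as pd
from utils import get_localhost, debug_df

localhost = get_localhost()  # nginx test host inside docker, local server otherwise
df = pd.DataFrame([{"url": f"{localhost}/mediapart_website.html"}])
debug_df(df)  # logs the first row at WARNING level for quick inspection
```

Note that `get_localhost()` returns the base URL without a trailing slash, which is why the f-strings in the updated tests now insert the `/` themselves.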