From f3f11e893604e986672e39127f3a03641ad366de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20Kl=C3=BCber?= Date: Tue, 2 Apr 2024 19:38:20 +0200 Subject: [PATCH] - removed rank - removed boxes and added url and text in DirectAnswer - changed xpath str to xpaths List[str] --- archive_query_log/orm.py | 13 +-- .../parsers/warc_direct_answers.py | 108 ++++++++---------- 2 files changed, 55 insertions(+), 66 deletions(-) diff --git a/archive_query_log/orm.py b/archive_query_log/orm.py index 24afe24..3f72887 100644 --- a/archive_query_log/orm.py +++ b/archive_query_log/orm.py @@ -1,7 +1,7 @@ from datetime import datetime from functools import cached_property from re import Pattern, compile as pattern -from typing import Literal +from typing import Literal, List from elasticsearch_dsl import Document, Keyword, Text, Date, RankFeature, \ InnerDoc as InnerDocument, Object, Index, Integer, Nested, Long, Boolean @@ -192,14 +192,12 @@ class Snippet(SnippetId): class DirectAnswerId(InnerDocument): id: str = Keyword() - rank: int = Integer() class DirectAnswer(DirectAnswerId): content: str = Text() - big_box: str | None = Keyword() - small_box: str | None = Text() - right_box: str | None = Text() + url: str | None = Keyword() + text: str | None = Text() class Serp(BaseDocument): @@ -220,7 +218,7 @@ class Serp(BaseDocument): warc_query_parser: InnerParser | None = Object(InnerParser) warc_snippets: list[SnippetId] | None = Nested(SnippetId) warc_snippets_parser: InnerParser | None = Object(InnerParser) - warc_direct_answer: list[SnippetId] | None = Nested(SnippetId) + warc_direct_answer: list[DirectAnswerId] | None = Nested(DirectAnswerId) warc_direct_answer_parser: InnerParser | None = Object(InnerParser) # rendered_warc_location: WarcLocation | None = Object(WarcLocation) @@ -461,9 +459,8 @@ class WarcDirectAnswerParser(BaseDocument): url_pattern_regex: str | None = Keyword() priority: float | None = RankFeature(positive_score_impact=True) parser_type: WarcDirectAnswerParserType = Keyword() - xpath: str | None = Keyword() + xpaths: List[str] | None = Keyword() url_xpath: str | None = Keyword() - title_xpath: str | None = Keyword() text_xpath: str | None = Keyword() @cached_property diff --git a/archive_query_log/parsers/warc_direct_answers.py b/archive_query_log/parsers/warc_direct_answers.py index b2aa2fa..41e899b 100644 --- a/archive_query_log/parsers/warc_direct_answers.py +++ b/archive_query_log/parsers/warc_direct_answers.py @@ -1,6 +1,6 @@ from functools import cache from itertools import chain -from typing import Iterable, Iterator +from typing import Iterable, Iterator, List from urllib.parse import urljoin from uuid import uuid5 @@ -31,15 +31,14 @@ def add_warc_direct_answer_parser( url_pattern_regex: str | None, priority: float | None, parser_type: WarcDirectAnswerParserType, - xpath: str | None, - big_box_xpath: str | None, - small_box_xpath: str | None, - right_box_xpath: str | None, + xpaths: List[str] | None, + url_xpath: str | None, + text_xpath: str | None, ) -> None: if priority is not None and priority <= 0: raise ValueError("Priority must be strictly positive.") if parser_type == "xpath": - if xpath is None: + if xpaths is None: raise ValueError("No XPath given.") else: raise ValueError(f"Invalid parser type: {parser_type}") @@ -59,10 +58,9 @@ def add_warc_direct_answer_parser( url_pattern_regex=url_pattern_regex, priority=priority, parser_type=parser_type, - xpath=xpath, - big_box_xpath=big_box_xpath, - small_box_xpath=small_box_xpath, - right_box_xpath=right_box_xpath, + xpaths=xpaths, + url_xpath=url_xpath, + text_xpath=text_xpath, ) parser.save(using=config.es.client) @@ -81,61 +79,56 @@ def _parse_warc_direct_answer( # Parse direct answer. if parser.parser_type == "xpath": - if parser.xpath is None: + if parser.xpaths is None: raise ValueError("No XPath given.") with open_warc(warc_store, warc_location) as record: tree = parse_xml_tree(record) if tree is None: return None - elements = safe_xpath(tree, parser.xpath, _Element) - if len(elements) == 0: - return None + for xpath in parser.xpaths: + elements = safe_xpath(tree, xpath, _Element) + if len(elements) == 0: + return None - direct_answers = [] - element: _Element - for i, element in enumerate(elements): - big_box: str | None = None - if parser.big_box_xpath is not None: - big_boxs = safe_xpath(element, parser.big_box_xpath, str) - if len(big_boxs) > 0: - big_box = big_boxs[0].strip() - small_box: str | None = None - if parser.small_box_xpath is not None: - small_boxs = safe_xpath(element, parser.small_box_xpath, str) - if len(small_boxs) > 0: - small_box = small_boxs[0].strip() - right_box: str | None = None - if parser.right_box_xpath is not None: - right_boxs = safe_xpath(element, parser.right_box_xpath, str) - if len(right_boxs) > 0: - right_box = right_boxs[0].strip() + direct_answers = [] + element: _Element + for i, element in enumerate(elements): + url: str | None = None + if parser.url_xpath is not None: + urls = safe_xpath(element, parser.url_xpath, str) + if len(urls) > 0: + url = urls[0].strip() + url = urljoin(capture_url, url) + text: str | None = None + if parser.text_xpath is not None: + texts = safe_xpath(element, parser.text_xpath, str) + if len(texts) > 0: + text = texts[0].strip() - content: str = tostring( - element, - encoding=str, - method="xml", - pretty_print=False, - with_tail=True, - ) - direct_answer_id_components = ( - serp_id, - parser.id, - str(hash(content)), - str(i), - ) - direct_answer_id = str(uuid5( - NAMESPACE_RESULT, - ":".join(direct_answer_id_components), - )) - direct_answers.append(DirectAnswer( - id=direct_answer_id, - rank=i, - content=content, - big_box=big_box, - small_box=small_box, - right_box=right_box, - )) + content: str = tostring( + element, + encoding=str, + method="xml", + pretty_print=False, + with_tail=True, + ) + direct_answer_id_components = ( + serp_id, + parser.id, + str(hash(content)), + str(i), + ) + direct_answer_id = str(uuid5( + NAMESPACE_RESULT, + ":".join(direct_answer_id_components), + )) + direct_answers.append(DirectAnswer( + id=direct_answer_id, + content=content, + url=url, + text=text, + )) return direct_answers else: raise ValueError(f"Unknown parser type: {parser.parser_type}") @@ -216,7 +209,6 @@ def _parse_serp_warc_direct_answer_action( warc_direct_answers=[ DirectAnswerId( id=direct_answer.id, - rank=direct_answer.rank, ) for direct_answer in warc_direct_answers ],