diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py index 7e46166..67b4bfe 100644 --- a/align_data/common/alignment_dataset.py +++ b/align_data/common/alignment_dataset.py @@ -71,7 +71,11 @@ def _add_authors(self, article: Article, authors: List[str]) -> Article: def make_data_entry(self, data, **kwargs) -> Article: data = merge_dicts(data, kwargs) + + summaries = data.pop("summaries", []) summary = data.pop("summary", None) + summaries += [summary] if summary else [] + authors = data.pop("authors", []) data['title'] = (data.get('title') or '').replace('\n', ' ').replace('\r', '') or None @@ -80,7 +84,7 @@ def make_data_entry(self, data, **kwargs) -> Article: **{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS}, ) self._add_authors(article, authors) - if summary: + for summary in summaries: # Note: This will be skipped if summaries is empty article.summaries.append(Summary(text=summary, source=self.name)) return article diff --git a/align_data/sources/arbital/arbital.py b/align_data/sources/arbital/arbital.py index f608793..b08393c 100644 --- a/align_data/sources/arbital/arbital.py +++ b/align_data/sources/arbital/arbital.py @@ -59,7 +59,7 @@ def flatten(val: Union[List[str], Tuple[str], str]) -> List[str]: return [val] -def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tuple[str, str]: +def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tuple[List[str], str]: """ Recursively parse text segments from `view` to generate a markdown Abstract Syntax Tree (AST). @@ -73,9 +73,9 @@ def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tup :param Iterator[Tuple[str, str]] view: An iterator that returns pairs of `part` and `next_part`, where `part` is the current segment and `next_part` provides a lookahead. 
- :return: <summary>, <text>, where <summary> is the summary extracted from the text, and <text> is the text with all + :return: <summaries>, <text>, where <summaries> are the summaries extracted from the text, and <text> is the text with all Arbital-specific markdown extensions replaced with standard markdown. - :rtype: Tuple[str, str] + :rtype: Tuple[List[str], str] Example: From the text: "[summary: A behaviorist [6w genie]]" @@ -83,20 +83,20 @@ def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tup current = [] view = iter([('[', 'summary: A behaviorist '), ('summary: A behaviorist ', '['), ('[', '6w genie'), ('6w genie', ']'), (']', ']'), (']', None)]) The function should return: - `('A behaviorist [genie](https://arbital.com/p/6w)', '')` + `(['A behaviorist [genie](https://arbital.com/p/6w)'], '')` Note: This function assumes that `view` provides a valid Arbital markdown sequence. Malformed sequences might lead to unexpected results. """ in_link = False - summary = "" + summaries = [] for part, next_part in view: if part == "[": # Recursively try to parse this new section - it's probably a link, but can be something else - sub_summary, text = markdownify_text([part], view) - summary += sub_summary + "\n\n" + sub_summaries, text = markdownify_text([part], view) + summaries.extend(sub_summaries) current.append(text) elif part == "]": @@ -110,33 +110,34 @@ def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tup # Handle Arbital summary. if descriptor.startswith("summary"): + # descriptor will be something like "summary(Technical): ", so we split by `:` summary_tag, summary_content = "".join(current[1:]).split(":", 1) - return f"{summary_tag}: {summary_content.strip()}", "" + return [f"{summary_tag}: {summary_content.strip()}"], "" # Handle TODO section (ignore it). if descriptor.startswith("todo"): - return "", "" + return [], "" # Handle Arbital link (e.g., "6w genie" -> "[6w genie](https://arbital.com/p/6w)").
- return "", parse_arbital_link(descriptor) + return [], parse_arbital_link(descriptor) elif in_link and part == ")": # this is the end of a markdown link - just join the contents, as they're already correct - return "", "".join(current + [part]) + return [], "".join(current + [part]) elif in_link and current[-1] == "(" and next_part != ")": # This link is strange... looks like it could be malformed? # Assuming that it's malformed and missing a closing `)` # This will remove any additional info in the link, but that seems a reasonable price? words = part.split(" ") - return "", "".join(current + [words[0], ") ", " ".join(words[1:])]) + return [], "".join(current + [words[0], ") ", " ".join(words[1:])]) else: # Just your basic text - add it to the processed parts and go on your merry way current.append(part) # Otherwise just join all the parts back together - return summary.strip(), "".join(flatten(current)).strip() + return summaries, "".join(flatten(current)).strip() def extract_text(text: str) -> Tuple[str, str]: @@ -146,7 +147,6 @@ def extract_text(text: str) -> Tuple[str, str]: @dataclass class Arbital(AlignmentDataset): - summary_key: str = "summary" ARBITAL_SUBSPACES = ["ai_alignment", "math", "rationality"] done_key = "alias" @@ -180,7 +180,7 @@ def get_item_key(self, item: str) -> str: def process_entry(self, alias: str): try: page = self.get_page(alias) - summary, text = extract_text(page["text"]) + summaries, text = extract_text(page["text"]) return self.make_data_entry( { @@ -193,7 +193,7 @@ def process_entry(self, alias: str): "authors": self.extract_authors(page), "alias": alias, "tags": list(filter(None, map(self.get_title, page["tagIds"]))), - "summary": summary, + "summaries": summaries, } ) except Exception as e: diff --git a/tests/align_data/sources/test_arbital.py b/tests/align_data/sources/test_arbital.py index 19ad8e9..87fed8c 100644 --- a/tests/align_data/sources/test_arbital.py +++ b/tests/align_data/sources/test_arbital.py @@ -86,37 +86,45 
@@ def test_markdownify_text_contents_arbital_markdown(text, expected): ( ( "[summary: summaries should be extracted] bla bla bla", - ("summary: summaries should be extracted", "bla bla bla"), + (["summary: summaries should be extracted"], "bla bla bla"), + ), + ( + "[summary: summaries should be extracted] [summary(Technical): technical summary should be handled separately] bla bla bla", + (["summary: summaries should be extracted", "summary(Technical): technical summary should be handled separately"], "bla bla bla"), + ), + ( + "[summary: summaries should be extracted] bla bla bla [summary(Technical): summaries should work in the middle too] bla bla bla", + (["summary: summaries should be extracted", "summary(Technical): summaries should work in the middle too"], "bla bla bla bla bla bla"), ), ( "[summary: \n whitespace should be stripped \n] bla bla bla", - ("summary: whitespace should be stripped", "bla bla bla"), + (["summary: whitespace should be stripped"], "bla bla bla"), ), ( "[summary(Bold): special summaries should be extracted] bla bla bla", - ("summary(Bold): special summaries should be extracted", "bla bla bla"), + (["summary(Bold): special summaries should be extracted"], "bla bla bla"), ), ( "[summary(Markdown): special summaries should be extracted] bla bla bla", - ("summary(Markdown): special summaries should be extracted", "bla bla bla"), + (["summary(Markdown): special summaries should be extracted"], "bla bla bla"), ), ( "[summary(BLEEEE): special summaries should be extracted] bla bla bla", - ("summary(BLEEEE): special summaries should be extracted", "bla bla bla"), + (["summary(BLEEEE): special summaries should be extracted"], "bla bla bla"), ), ( "[summary: markdown is handled: [bla](https://bla.bla)] bla bla bla", - ("summary: markdown is handled: [bla](https://bla.bla)", "bla bla bla"), + (["summary: markdown is handled: [bla](https://bla.bla)"], "bla bla bla"), ), ( "[summary: markdown is handled: [123 ble ble]] bla bla bla", - ("summary: 
markdown is handled: [ble ble](https://arbital.com/p/123)", "bla bla bla"), + (["summary: markdown is handled: [ble ble](https://arbital.com/p/123)"], "bla bla bla"), ), ), ) def test_markdownify_text_summary_and_content(text, expected): - summary, text = extract_text(text) - assert summary == expected[0] + summaries, text = extract_text(text) + assert summaries == expected[0] assert text == expected[1]