Skip to content

Commit

Permalink
Arbital many summaries (#179)
Browse files Browse the repository at this point in the history
* first commit

* refactor markdownify_text with summaries

* added test for new arbital summary behaviour

* minor refactor of parse_arbital_link

* added edge cases to parse_arbital_link

* summaries optional key in data_entry added

* arbital now uses a list of summaries instead of appending many summaries together
  • Loading branch information
Thomas-Lemoine committed Sep 2, 2023
1 parent 4782ec3 commit 0bcf0d9
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 26 deletions.
6 changes: 5 additions & 1 deletion align_data/common/alignment_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,11 @@ def _add_authors(self, article: Article, authors: List[str]) -> Article:

def make_data_entry(self, data, **kwargs) -> Article:
data = merge_dicts(data, kwargs)

summaries = data.pop("summaries", [])
summary = data.pop("summary", None)
summaries += [summary] if summary else []

authors = data.pop("authors", [])
data['title'] = (data.get('title') or '').replace('\n', ' ').replace('\r', '') or None

Expand All @@ -80,7 +84,7 @@ def make_data_entry(self, data, **kwargs) -> Article:
**{k: v for k, v in data.items() if k in ARTICLE_MAIN_KEYS},
)
self._add_authors(article, authors)
if summary:
for summary in summaries: # Note: This will be skipped if summaries is empty
article.summaries.append(Summary(text=summary, source=self.name))
return article

Expand Down
32 changes: 16 additions & 16 deletions align_data/sources/arbital/arbital.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def flatten(val: Union[List[str], Tuple[str], str]) -> List[str]:
return [val]


def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tuple[str, str]:
def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tuple[List[str], str]:
"""
Recursively parse text segments from `view` to generate a markdown Abstract Syntax Tree (AST).
Expand All @@ -73,30 +73,30 @@ def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tup
:param Iterator[Tuple[str, str]] view: An iterator that returns pairs of `part` and `next_part`, where `part` is the
current segment and `next_part` provides a lookahead.
:return: <summary>, <text>, where <summary> is the summary extracted from the text, and <text> is the text with all
:return: <summaries>, <text>, where <summaries> are the summaries extracted from the text, and <text> is the text with all
Arbital-specific markdown extensions replaced with standard markdown.
:rtype: Tuple[str, str]
:rtype: Tuple[List[str], str]
Example:
From the text: "[summary: A behaviorist [6w genie]]"
We get the input:
current = []
view = iter([('[', 'summary: A behaviorist '), ('summary: A behaviorist ', '['), ('[', '6w genie'), ('6w genie', ']'), (']', ']'), (']', None)])
The function should return:
`('A behaviorist [genie](https://arbital.com/p/6w)', '')`
`(['A behaviorist [genie](https://arbital.com/p/6w)'], '')`
Note:
This function assumes that `view` provides a valid Arbital markdown sequence. Malformed sequences might lead to
unexpected results.
"""
in_link = False
summary = ""
summaries = []

for part, next_part in view:
if part == "[":
# Recursively try to parse this new section - it's probably a link, but can be something else
sub_summary, text = markdownify_text([part], view)
summary += sub_summary + "\n\n"
sub_summaries, text = markdownify_text([part], view)
summaries.extend(sub_summaries)
current.append(text)

elif part == "]":
Expand All @@ -110,33 +110,34 @@ def markdownify_text(current: List[str], view: Iterator[Tuple[str, str]]) -> Tup

# Handle Arbital summary.
if descriptor.startswith("summary"):
# descriptor will be something like "summary(Technical): <contents>", so we split by `:`
summary_tag, summary_content = "".join(current[1:]).split(":", 1)
return f"{summary_tag}: {summary_content.strip()}", ""
return [f"{summary_tag}: {summary_content.strip()}"], ""

# Handle TODO section (ignore it).
if descriptor.startswith("todo"):
return "", ""
return [], ""

# Handle Arbital link (e.g., "6w genie" -> "[genie](https://arbital.com/p/6w)").
return "", parse_arbital_link(descriptor)
return [], parse_arbital_link(descriptor)

elif in_link and part == ")":
# this is the end of a markdown link - just join the contents, as they're already correct
return "", "".join(current + [part])
return [], "".join(current + [part])

elif in_link and current[-1] == "(" and next_part != ")":
# This link is strange... looks like it could be malformed?
# Assuming that it's malformed and missing a closing `)`
# This will remove any additional info in the link, but that seems a reasonable price?
words = part.split(" ")
return "", "".join(current + [words[0], ") ", " ".join(words[1:])])
return [], "".join(current + [words[0], ") ", " ".join(words[1:])])

else:
# Just your basic text - add it to the processed parts and go on your merry way
current.append(part)

# Otherwise just join all the parts back together
return summary.strip(), "".join(flatten(current)).strip()
return summaries, "".join(flatten(current)).strip()


def extract_text(text: str) -> Tuple[str, str]:
Expand All @@ -146,7 +147,6 @@ def extract_text(text: str) -> Tuple[str, str]:

@dataclass
class Arbital(AlignmentDataset):
summary_key: str = "summary"

ARBITAL_SUBSPACES = ["ai_alignment", "math", "rationality"]
done_key = "alias"
Expand Down Expand Up @@ -180,7 +180,7 @@ def get_item_key(self, item: str) -> str:
def process_entry(self, alias: str):
try:
page = self.get_page(alias)
summary, text = extract_text(page["text"])
summaries, text = extract_text(page["text"])

return self.make_data_entry(
{
Expand All @@ -193,7 +193,7 @@ def process_entry(self, alias: str):
"authors": self.extract_authors(page),
"alias": alias,
"tags": list(filter(None, map(self.get_title, page["tagIds"]))),
"summary": summary,
"summaries": summaries,
}
)
except Exception as e:
Expand Down
26 changes: 17 additions & 9 deletions tests/align_data/sources/test_arbital.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,37 +86,45 @@ def test_markdownify_text_contents_arbital_markdown(text, expected):
(
(
"[summary: summaries should be extracted] bla bla bla",
("summary: summaries should be extracted", "bla bla bla"),
(["summary: summaries should be extracted"], "bla bla bla"),
),
(
"[summary: summaries should be extracted] [summary(Technical): technical summary should be handled separately] bla bla bla",
(["summary: summaries should be extracted", "summary(Technical): technical summary should be handled separately"], "bla bla bla"),
),
(
"[summary: summaries should be extracted] bla bla bla [summary(Technical): summaries should work in the middle too] bla bla bla",
(["summary: summaries should be extracted", "summary(Technical): summaries should work in the middle too"], "bla bla bla bla bla bla"),
),
(
"[summary: \n whitespace should be stripped \n] bla bla bla",
("summary: whitespace should be stripped", "bla bla bla"),
(["summary: whitespace should be stripped"], "bla bla bla"),
),
(
"[summary(Bold): special summaries should be extracted] bla bla bla",
("summary(Bold): special summaries should be extracted", "bla bla bla"),
(["summary(Bold): special summaries should be extracted"], "bla bla bla"),
),
(
"[summary(Markdown): special summaries should be extracted] bla bla bla",
("summary(Markdown): special summaries should be extracted", "bla bla bla"),
(["summary(Markdown): special summaries should be extracted"], "bla bla bla"),
),
(
"[summary(BLEEEE): special summaries should be extracted] bla bla bla",
("summary(BLEEEE): special summaries should be extracted", "bla bla bla"),
(["summary(BLEEEE): special summaries should be extracted"], "bla bla bla"),
),
(
"[summary: markdown is handled: [bla](https://bla.bla)] bla bla bla",
("summary: markdown is handled: [bla](https://bla.bla)", "bla bla bla"),
(["summary: markdown is handled: [bla](https://bla.bla)"], "bla bla bla"),
),
(
"[summary: markdown is handled: [123 ble ble]] bla bla bla",
("summary: markdown is handled: [ble ble](https://arbital.com/p/123)", "bla bla bla"),
(["summary: markdown is handled: [ble ble](https://arbital.com/p/123)"], "bla bla bla"),
),
),
)
def test_markdownify_text_summary_and_content(text, expected):
summary, text = extract_text(text)
assert summary == expected[0]
summaries, text = extract_text(text)
assert summaries == expected[0]
assert text == expected[1]


Expand Down

0 comments on commit 0bcf0d9

Please sign in to comment.