diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py index 5d0632b2..71f1a608 100644 --- a/align_data/common/alignment_dataset.py +++ b/align_data/common/alignment_dataset.py @@ -52,6 +52,9 @@ class AlignmentDataset: COOLDOWN = 0 """An optional cool down between processing entries""" + lazy_eval = False + """Whether to lazy fetch items. This is nice in that it will start processing, but messes up the progress bar.""" + # Internal housekeeping variables _entry_idx = 0 """Used internally for writing debugging info - each file write will increment it""" @@ -142,7 +145,13 @@ def unprocessed_items(self, items=None): def not_processed(item): return self.get_item_key(item) not in self._outputted_items - return tqdm(list(filter(not_processed, items or self.items_list))) + filtered = filter(not_processed, items or self.items_list) + + # greedily fetch all items if not lazy eval. This makes the progress bar look nice + if not self.lazy_eval: + filtered = list(filtered) + + return tqdm(filtered) def fetch_entries(self): """Get all entries to be written to the file.""" diff --git a/align_data/greaterwrong/__init__.py b/align_data/greaterwrong/__init__.py index 3bcc525f..8f4a079c 100644 --- a/align_data/greaterwrong/__init__.py +++ b/align_data/greaterwrong/__init__.py @@ -1,16 +1,25 @@ -from .greaterwrong import GreaterWrong +from .greaterwrong import GreaterWrong, fetch_ea_forum_topics, fetch_LW_tags GREATERWRONG_REGISTRY = [ GreaterWrong( name="lesswrong", - base_url='https://www.greaterwrong.com', + base_url='https://www.lesswrong.com', start_year=2005, min_karma=1, + af=False, + ), + GreaterWrong( + name="alignmentforum", + base_url='https://www.alignmentforum.org', + start_year=2009, + min_karma=1, + af=True, ), GreaterWrong( name="eaforum", - base_url='https://ea.greaterwrong.com', + base_url='https://forum.effectivealtruism.org', start_year=2011, min_karma=1, + af=False, ) ] diff --git a/align_data/greaterwrong/greaterwrong.py b/align_data/greaterwrong/greaterwrong.py index a63ea140..0d45014f 100644 --- a/align_data/greaterwrong/greaterwrong.py +++ b/align_data/greaterwrong/greaterwrong.py @@ -6,199 +6,44 @@ import requests import jsonlines -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup from tqdm import tqdm from markdownify import markdownify -from align_data.common.alignment_dataset import AlignmentDataset , DataEntry +from align_data.common.alignment_dataset import AlignmentDataset, DataEntry logger = logging.getLogger(__name__) -def extract_author(base_url, a): - return { - 'fullName': a.attrs.get('data-full-name'), - 'userId': a.attrs.get('data-userid'), - 'userLink': a.attrs.get('href') and base_url + a.attrs.get('href'), - 'name': a.text, - } - - -def get_attr(elem: Tag, tag: str, selector, attr=None, processor=lambda x: x): - """A generic extractor of HTML info, which will also handle the item not existing. - - :param Tag elem: the element to search in - :param str tag: the HTML tag to look for, e.g. `div`. Can also be `None`, in which case any tag will work - :param dict selector: additional selector to drill down on, e.g. `{'class': 'bla'}` - :param str attr: the attribute of the element to extract, e.g. 'href'. 
Ignored if `None` - :param fn processor: an optional transformer to be run on the extracted value for postprocessing - """ - item = elem.find(tag, selector) - value = item - if attr and item: - value = item and item.get(attr) - return value and processor(value) - - -def parse_karma(meta_div: Tag): - """Extract the karma from the given element. - - :param Tag meta_div: the element to be processed - this is the div containing url, karma, authors etc. - :returns: a `(score, karma)` tuple, where `score` is the overall karma, while `karma` is a dict of per site karma - """ - site = get_attr(meta_div, 'a', {'class': 'lw2-link'}, processor=lambda a: a.get('title') or next(a.children)) - karma_text = get_attr(meta_div, 'span', {'class': 'karma-value'}, processor=lambda d: d.text.strip()) - if not karma_text: - score, karma = None, {} - # In the case of this post only being on one server, the karma is provided as a string like "123 points" - elif 'point' in karma_text: - score = int(karma_text.split()[0].replace('−', '-')) - karma = {site: score} - # When it's e.g. an alignment forum post, it will have site specific karma, like "LW: 123, AF: 432" - elif karma_text: - parts = karma_text.replace(':', '').split() - karma = {k: int(v) for k, v in zip(parts[::2], parts[1::2])} - score = list(karma.values())[0] - else: - score, karma = None, {} - return score, karma - - -def extract_metadata(base_url: str, post: Tag, meta_div=None): - """Extract the metadata of the post/comment. - - :param str base_url: the base url of the forum being used, e.g. 'https://lesswrong.com' - :param Tag post: the HTML element to process - :param Tag meta_div: used if the metadata is in multiple tags. Will use `post` if `None` - - :returns: a dict of extracted metadata values. Values that are empty will be removed - """ - meta_div = meta_div or post - score, karma = parse_karma(meta_div) - - metadata = { - 'title': next(post.children).text, - 'url': get_attr(meta_div, 'a', {'class': 'lw2-link'}, 'href'), - 'post_url': get_attr(post, 'a', {'class': 'post-title-link'}, 'href', lambda url: base_url + url), - 'link_post': get_attr(post, 'a', {'class': 'link-post-link'}, 'href'), - 'authors': [extract_author(base_url, a) for a in meta_div.findChildren('a', {'class': 'author'})], - 'date_published': get_attr( - meta_div, None, {'class': 'date'}, - processor=lambda d: datetime.datetime.strptime(d.text.strip(), '%d %b %Y %H:%M %Z').isoformat() - ), - 'votes': get_attr(meta_div, 'span', {'class': 'karma-value'}, 'title', lambda v: int(v.split()[0])), - 'score': score, - 'karma': karma, - 'tags': get_attr(meta_div, 'div', {'id': 'tags'}, processor=lambda d: [a.text.strip() for a in d.find_all('a')]), - 'words': get_attr(meta_div, 'span', {'class': 'read-time'}, 'title'), #meta_div.find('span', {'class': 'read-time'}).attrs.get('title'), - } - return {k: v for k, v in metadata.items() if v} - - -def fetch_month_urls(base_url: str, year: int, month: int, delay=1): - """Fetch all posts from the given `year` and `month` from `base_url`. - - This will automatically paginate through all available pages. - GreaterWrong has a limit of 2000 entries per pagination, which is why this is done per month - - To avoid clobbering the service, `delay` seconds will be waited between each network call. 
- - :returns: a list of metadata dicts for each post - """ - all_posts = [] - - url = f'/archive/{year}/{month}' - while url: - logger.debug('Fetching items for %s', url) - res = requests.get(base_url + url) - soup = BeautifulSoup(res.text, "html.parser") - - posts = soup.find_all('h1', {'class': 'listing'}) - all_posts += [extract_metadata(base_url, post, post.find_next_sibling('div')) for post in posts] - - url = soup.find('a', {'class': 'nav-item-next'}) - url = url and url.attrs.get('href').replace('#', '') - - time.sleep(delay) - - logger.debug('Found %s posts for %s/%s', len(all_posts), year, month) - return all_posts - - -def fetch_all_urls(base_url: str, urls_data_path: Path, start_year: int, delay=1): - """Fetch the metadata of all posts from `base_url`, starting from `start_year`. - - This will create a separate data file for each month, starting from the earliest one checked. The resulting - files will contain a JSON object per line containing the metadata of each post. If there were no posts in a - give month (this happened in the beginning of LW, for example), then an empty file will be created to mark - that month as checked. The latest month will always be rechecked, as it's most likely not up to date. - """ - # Any url file that was created contains all urls for that month and so can be skipped. This - # assumption only holds if post publication dates cannot be changed, and if posts won't retroactively - # appear - both of which seem reasonable. Ignore the latest file though, as it probably won't contain - # all urls for that given month - known_urls = sorted(urls_data_path.glob('*'))[:-1] - - now = datetime.date.today() - # Construct a big list of all months, rather than having nested loops, coz then - # tqdm can show a nice loading bar - dates = [ - (year, month) - for year in range(start_year, now.year + 1) - for month in range(1, 13) - ] - for year, month in tqdm(dates): - data_file = urls_data_path / f'{year}_{month}.jsonl' - - if data_file in known_urls: - logger.debug(f'Already processed {data_file.name} - skipping') - continue - - try: - posts = fetch_month_urls(base_url, year, month, delay) - except Exception as e: - logger.error(e) - else: - with jsonlines.open(data_file , mode='w') as writer: - writer.write_all(posts) - - # No point in looking for future posts... 
- if year == now.year and month == now.month: - break - - -def parse_comments(base_url: str, elem: Tag): - """Recursively extract the whole comment tree from the given HTML `elem`.""" - if not elem or not elem.get('class'): - return None - if 'comment-thread' in elem.get('class') or 'comments' in elem.get('class'): - return list(filter(None, map(lambda x: parse_comments(base_url, x), elem.children))) - if 'comment-item' in elem.get('class'): - comment = elem.find('div', {'class': 'comment'}) - if 'deleted-comment' in comment.get('class'): - return None +def fetch_LW_tags(url): + res = requests.get( + url + '/tag/ai', + headers={'User-Agent': 'Mozilla /5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0'}, + ) + soup = BeautifulSoup(res.content, "html.parser") + container = soup.find('div', {'class': 'TagPage-description'}).find('table') + return {a.text.strip() for a in container.find_all('a') if '/tag/' in a.get('href')} - metadata = extract_metadata(base_url, comment) - return { - 'text': comment.find('div', {'class': 'body-text'}).text, - 'votes': metadata.get('votes'), - 'score': metadata.get('score'), - 'karma': metadata.get('karma'), - 'url': metadata.get('url'), - 'date_published': metadata['date_published'], - 'author': metadata.get('authors', [{}])[0].get('name'), - 'comments': parse_comments(base_url, elem.find('ul', {'class': 'comment-thread'})), - } +def fetch_ea_forum_topics(url): + res = requests.get(url + '/topics/ai-safety') + soup = BeautifulSoup(res.content, "html.parser") + container = soup.find('div', {'class': 'SidebarSubtagsBox-root'}) + return {a.text.strip() for a in container.find_all('a') if '/topics/' in a.get('href', '')} - return None +def get_allowed_tags(url, name): + if name == 'alignmentforum': + return set() + try: + if name == 'lesswrong': + return fetch_LW_tags(url) + if name == 'eaforum': + return fetch_ea_forum_topics(url) + except Exception: + raise ValueError('Could not fetch tags! Please retry') -def fetch_ai_tags(url): - res = requests.get(url + '/tag/ai') - soup = BeautifulSoup(res.content, "html.parser") - container = soup.find('div', {'class': 'tag-description'}).find('table') - return [a.text.strip() for a in container.find_all('a') if a.get('href').startswith('/tag/')] + raise ValueError(f'Could not fetch tags for unknown datasource: "{name}". Must be one of alignmentforum|lesswrong|eaforum') @dataclass @@ -212,79 +57,118 @@ class GreaterWrong(AlignmentDataset): base_url: str start_year: int min_karma: int + """Posts must have at least this much karma to be returned.""" + af: bool + """Whether alignment forum posts should be returned""" + limit = 50 COOLDOWN_TIME : float = 0.5 done_key = "url" + lazy_eval = True def setup(self): super().setup() logger.info(f"Grabbing most recent links (grabs all links if /{self.name}/urls/ is empty)...") self.skipped_urls = self.raw_data_path / self.name / 'skipped' - self.files_path = self.raw_data_path / self.name / 'urls' - self.files_path.mkdir(parents=True, exist_ok=True) - fetch_all_urls(self.base_url, self.files_path, self.start_year, self.COOLDOWN) logger.debug("Fetching ai tags...") - try: - self.ai_tags = set(fetch_ai_tags(self.base_url)) - except Exception: - raise ValueError('Could not fetch tags! 
Please retry') - - @property - def items_list(self): - logger.debug("Converting each link to a json with post & comments...") - if self.skipped_urls.exists(): - with open(self.skipped_urls) as f: - skipped = {l.strip() for l in f} - else: - skipped = [] + self.ai_tags = get_allowed_tags(self.base_url, self.name) - links = [] - for filename in self.files_path.glob('*'): - with jsonlines.open(filename) as reader: - links += [ - item for item in reader - if item.get('post_url') and item.get('score', 0) >= self.min_karma and item['post_url'] not in skipped - ] - return links + def tags_ok(self, post): + return not self.ai_tags or {t['name'] for t in post['tags']} & self.ai_tags def get_item_key(self, item): - return item['url'] - - def process_entry(self, item): - # Skip this if the request failed. The idea being that the next scrape will pick it up - post_url = item['post_url'] - try: - res = requests.get(post_url) - except requests.ConnectTimeout: - logger.error('Timeout while fetching %s - skipping for now', post_url) - return None + return item['pageUrl'] + + def make_query(self, after): + return """{ + posts(input: { + terms: { + excludeEvents: true + view: "old" + """ \ + f" af: {self.af}\n" \ + f" limit: {self.limit}\n" \ + f" karmaThreshold: {self.min_karma}\n" \ + f' after: "{after}"\n' \ + """ filter: "tagged" + } + }) { + totalCount + results { + _id + title + slug + pageUrl + postedAt + modifiedAt + score + extendedScore + baseScore + voteCount + commentCount + wordCount + tags { + name + } + user { + username + displayName + } + coauthors { + username + displayName + } + af + htmlBody + } + } + }""" + + def fetch_posts(self, query): + res = requests.post( + f'{self.base_url}/graphql', + # The GraphQL endpoint returns a 403 if the user agent isn't set... 
Makes sense, but is annoying + headers={'User-Agent': 'Mozilla /5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0'}, + json={'query': query} + ) + return res.json()['data']['posts'] - if res.status_code != 200: - logger.error('Got status code of %s while fetching %s - skipping for now', res.status_code, post_url) - return None + @property + def items_list(self): + next_date = datetime.datetime(self.start_year, 1, 1).isoformat() + 'Z' + if self.jsonl_path.exists() and self.jsonl_path.lstat().st_size: + with jsonlines.open(self.jsonl_path) as f: + for item in f: + pass + next_date = item['date_published'] - html = res.text.replace("\u201c", '"').replace("\u201d", '"') - soup = BeautifulSoup(html, "html.parser") + while next_date: + posts = self.fetch_posts(self.make_query(next_date)) - post = soup.find('main', {'class': 'post'}) + for post in posts['results']: + if post['htmlBody'] and self.tags_ok(post): + yield post - title = post.find('h1') - meta_div = title.find_next_sibling('div') - metadata = extract_metadata(self.base_url, title, meta_div) + next_date = posts['results'][-1]['postedAt'] + if len(posts['results']) < 50: + return - # Skip this item if it doesn't have at least one AI tag - if not self.ai_tags & set(metadata.get('tags', [])): - with open(self.skipped_urls, 'a') as f: - f.write(post_url + '\n') - return None + time.sleep(self.COOLDOWN) - return DataEntry( - item, - text=markdownify(post.find('div', {'class': 'body-text'}).renderContents()), - comments=parse_comments(self.base_url, soup.find('div', {'id': 'comments'})), - source=self.name, - source_type='greaterwrong', - **metadata - ) + def process_entry(self, item): + return DataEntry({ + 'title': item['title'], + 'url': item['pageUrl'], + 'date_published': item['postedAt'], + 'modifiedAt': item['modifiedAt'], + 'text': markdownify(item['htmlBody']), + "source": self.name, + "source_type": "GreaterWrong", + 'votes': item['voteCount'], + 'karma': item['baseScore'], + 'tags': [t['name'] for t in item['tags']], + 'words': item['wordCount'], + 'authors': [item['user']] + item['coauthors'], + }) diff --git a/tests/align_data/test_greaterwrong.py b/tests/align_data/test_greaterwrong.py deleted file mode 100644 index c6fac7c6..00000000 --- a/tests/align_data/test_greaterwrong.py +++ /dev/null @@ -1,212 +0,0 @@ -import json -import pytest -import jsonlines -from pathlib import Path -from bs4 import BeautifulSoup - -from align_data.common.alignment_dataset import DataEntry -from align_data.greaterwrong.greaterwrong import ( - GreaterWrong, extract_author, get_attr, parse_karma, extract_metadata, fetch_month_urls, fetch_all_urls, parse_comments -) - - -def test_extract_author_with_valid_data(): - base_url = 'http://example.com' - html = 'John Doe' - soup = BeautifulSoup(html, 'html.parser') - a = soup.find('a') - - expected_result = { - 'fullName': 'John Doe', - 'userId': '12345', - 'userLink': 'http://example.com/user/12345', - 'name': 'John Doe', - } - assert extract_author(base_url, a) == expected_result - - -def test_extract_author_with_missing_data(): - base_url = 'http://example.com' - html = 'John Doe' - soup = BeautifulSoup(html, 'html.parser') - a = soup.find('a') - - expected_result = { - 'fullName': None, - 'userId': None, - 'userLink': 'http://example.com/user/12345', - 'name': 'John Doe', - } - assert extract_author(base_url, a) == expected_result - - -def test_extract_author_with_invalid_data(): - base_url = 'http://example.com' - html = 'John Doe' - soup = BeautifulSoup(html, 'html.parser') 
- a = soup.find('a') - - expected_result = { - 'fullName': None, - 'userId': None, - 'userLink': None, - 'name': 'John Doe', - } - assert extract_author(base_url, a) == expected_result - - -def test_extract_author_with_missing_base_url(): - base_url = None - html = 'John Doe' - soup = BeautifulSoup(html, 'html.parser') - a = soup.find('a') - - expected_result = { - 'fullName': 'John Doe', - 'userId': '12345', - 'userLink': None, - 'name': 'John Doe', - } - with pytest.raises(TypeError): - extract_author(base_url, a) - - -@pytest.mark.parametrize('div, selector, attr, expected', ( - # Test basic functionality - ('div', {'class': 'inner'}, 'id', 'target'), - ('div', {'class': 'inner'}, 'data-value', '123'), - - # missing tag selects anything - (None, {'class': 'inner'}, 'id', 'target'), - (None, {'class': 'non-existent'}, 'id', None), - - # Test missing attribute - ('div', {'class': 'inner'}, 'non-existent', None), - ('div', {'class': 'non-existent'}, 'id', None), -)) -def test_get_attr(div, selector, attr, expected): - html = ''' -
-    <div class="outer">
-        <div class="inner" id="target" data-value="123">Some text</div>
-    </div>
- ''' - soup = BeautifulSoup(html, 'html.parser') - outer = soup.find('div', {'class': 'outer'}) - - assert get_attr(outer, div, selector, attr) == expected - - -@pytest.mark.parametrize('attr, processor, expected', ( - (None, lambda v: v.text.upper(), 'SOME TEXT'), - (None, lambda v: 'ble ble ble', 'ble ble ble'), - - ('data-value', int, 123), - ('class', lambda v: v + ['bla bla'], ['inner', 'bla bla']), -)) -def test_get_attr_post_processing(attr, processor, expected): - html = ''' -
-    <div class="outer">
-        <div class="inner" id="target" data-value="123">Some text</div>
-    </div>
- ''' - soup = BeautifulSoup(html, 'html.parser') - outer = soup.find('div', {'class': 'outer'}) - - assert get_attr(outer, 'div', {'class': 'inner'}, attr, processor) == expected - - -@pytest.mark.parametrize('text, expected_karma, expected_score', ( - ('1 point', {'LW': 1}, 1), - ('432 points', {'LW': 432}, 432), - - ('LW: 32 AF: 42 EA: 12', {'LW': 32, 'AF': 42, 'EA': 12}, 32), - ('AF: 42 LW: 32 EA: 12', {'LW': 32, 'AF': 42, 'EA': 12}, 42), -)) -def test_parse_karma(text, expected_karma, expected_score): - html = f'''
-    <div>
-        <a class="lw2-link" title="LW">Less Wrong</a>
-        <span class="karma-value">{text}</span>
-    </div>
- ''' - meta_div = BeautifulSoup(html, 'html.parser') - score, karma = parse_karma(meta_div) - assert score == expected_score - assert karma == expected_karma - - -@pytest.mark.parametrize('html', ( - '
', - '''
- Less Wrong -
''', - '''
- Less Wrong - -
''', -)) -def test_parse_karma_missing_counts(html): - # Test case when Karma and site are None - meta_div = BeautifulSoup(html, 'html.parser') - score, karma = parse_karma(meta_div) - assert score == None - assert karma == {} - - -def test_extract_metadata(): - html = ''' -
-

Outside the Laboratory

-
- Eliezer Yudkowsky - 21 Jan 2007 3:46 UTC -
- - 132 points - -
351 comments - LW link - Archive - - - -
-
- ''' - post = BeautifulSoup(html, 'html.parser').find('h1') - assert extract_metadata('http://bla.bla', post, post.find_next_sibling('div')) == { - 'title': 'Outside the Laboratory', - 'url': 'https://www.lesswrong.com/posts/N2pENnTPB75sfc9kb/outside-the-laboratory', - 'authors': [{ - 'fullName': '', - 'userId': 'nmk3nLpQE89dMRzzN', - 'userLink': 'http://bla.bla/users/eliezer_yudkowsky', - 'name': 'Eliezer Yudkowsky' - }], - 'date_published': '2007-01-21T03:46:00', - 'votes': 108, - 'score': 132, - 'karma': {' LW ': 132}, - 'tags': ['Law-Thinking', 'Rationality', 'Practice & Philosophy of Science', 'Religion', 'Compartmentalization'] - } - - -def test_extract_metadata_remove_empty(): - html = '

Outside the Laboratory

' - post = BeautifulSoup(html, 'html.parser').find('h1') - assert extract_metadata('http://bla.bla', post) == {'title': 'Outside the Laboratory'}
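Note on the `lazy_eval` flag introduced in `AlignmentDataset` above: when a subclass sets it (as `GreaterWrong` now does), `unprocessed_items` hands the filtered generator straight to `tqdm`, so processing can start before all items are known, at the cost of a progress bar with no total; the default path still materialises the list first. A minimal sketch of the behaviour, with `items` and `is_new` standing in for `self.items_list` and the `done_key` check:

    from tqdm import tqdm

    def unprocessed(items, is_new, lazy_eval=False):
        filtered = filter(is_new, items)
        if not lazy_eval:
            # Greedy path: materialising the list gives tqdm a total to display.
            filtered = list(filtered)
        return tqdm(filtered)

    # With lazy_eval=True nothing is evaluated until the result is iterated;
    # with the default, filter() is exhausted right here, so the bar knows its length.
    # unprocessed(range(10), lambda i: i % 2 == 0, lazy_eval=True)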
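The tag allowlist works as follows: `get_allowed_tags` pulls the set of AI-related tag or topic names from the forum itself (and returns an empty set for the alignment forum, presumably because everything there is in scope), and `tags_ok` keeps a post when no allowlist applies or when the post shares at least one tag name with it. A rough standalone equivalent (the sample data is made up):

    def tags_ok(post, allowed_tags):
        # Truthy when there is no allowlist, or when the intersection is non-empty.
        return not allowed_tags or {t['name'] for t in post['tags']} & allowed_tags

    tags_ok({'tags': [{'name': 'AI'}, {'name': 'Rationality'}]}, {'AI', 'AI Risk'})  # -> {'AI'}
    tags_ok({'tags': [{'name': 'Parenting'}]}, {'AI', 'AI Risk'})                    # -> set(), falsy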
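For context on the new fetch path: instead of scraping greaterwrong.com archive pages month by month, `items_list` now pages through the forum's GraphQL endpoint, resuming from the last `date_published` already written to the jsonl file and advancing the `after` cursor to the `postedAt` of the last post in each batch. A self-contained sketch of that loop; the endpoint path, query shape and the User-Agent requirement are taken from the diff, while the helper names, base URL and generic User-Agent string here are illustrative:

    import time
    import requests

    BASE_URL = 'https://www.lesswrong.com'   # any of the registry base_urls would do
    HEADERS = {'User-Agent': 'Mozilla/5.0'}  # the endpoint returns a 403 without a user agent

    def fetch_page(after, limit=50, min_karma=1, af=False):
        # Same shape as GreaterWrong.make_query, trimmed to a few result fields.
        # GraphQL expects lowercase booleans, hence str(af).lower().
        query = f'''{{
          posts(input: {{terms: {{
            excludeEvents: true
            view: "old"
            af: {str(af).lower()}
            limit: {limit}
            karmaThreshold: {min_karma}
            after: "{after}"
            filter: "tagged"
          }}}}) {{
            results {{ title pageUrl postedAt htmlBody }}
          }}
        }}'''
        res = requests.post(f'{BASE_URL}/graphql', headers=HEADERS, json={'query': query})
        return res.json()['data']['posts']['results']

    def iter_posts(start='2005-01-01T00:00:00Z', limit=50, cooldown=0.5):
        after = start
        while True:
            results = fetch_page(after, limit=limit)
            yield from (post for post in results if post['htmlBody'])
            if len(results) < limit:   # a short page means we have caught up
                return
            after = results[-1]['postedAt']
            time.sleep(cooldown)

One small difference: this sketch compares the page size against the same `limit` it requested, whereas `items_list` above checks against a literal 50.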