diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py
index 5d0632b2..71f1a608 100644
--- a/align_data/common/alignment_dataset.py
+++ b/align_data/common/alignment_dataset.py
@@ -52,6 +52,9 @@ class AlignmentDataset:
COOLDOWN = 0
"""An optional cool down between processing entries"""
+ lazy_eval = False
+    """Whether to fetch items lazily. This lets processing start sooner, but stops the progress bar from showing a total."""
+
# Internal housekeeping variables
_entry_idx = 0
"""Used internally for writing debugging info - each file write will increment it"""
@@ -142,7 +145,13 @@ def unprocessed_items(self, items=None):
def not_processed(item):
return self.get_item_key(item) not in self._outputted_items
- return tqdm(list(filter(not_processed, items or self.items_list)))
+ filtered = filter(not_processed, items or self.items_list)
+
+    # Greedily fetch all items when not lazily evaluating - this lets tqdm know the total, so the progress bar looks nice
+ if not self.lazy_eval:
+ filtered = list(filtered)
+
+ return tqdm(filtered)
def fetch_entries(self):
"""Get all entries to be written to the file."""
diff --git a/align_data/greaterwrong/__init__.py b/align_data/greaterwrong/__init__.py
index 3bcc525f..8f4a079c 100644
--- a/align_data/greaterwrong/__init__.py
+++ b/align_data/greaterwrong/__init__.py
@@ -1,16 +1,25 @@
-from .greaterwrong import GreaterWrong
+from .greaterwrong import GreaterWrong, fetch_ea_forum_topics, fetch_LW_tags
GREATERWRONG_REGISTRY = [
GreaterWrong(
name="lesswrong",
- base_url='https://www.greaterwrong.com',
+ base_url='https://www.lesswrong.com',
start_year=2005,
min_karma=1,
+ af=False,
+ ),
+ GreaterWrong(
+ name="alignmentforum",
+ base_url='https://www.alignmentforum.org',
+ start_year=2009,
+ min_karma=1,
+ af=True,
),
GreaterWrong(
name="eaforum",
- base_url='https://ea.greaterwrong.com',
+ base_url='https://forum.effectivealtruism.org',
start_year=2011,
min_karma=1,
+ af=False,
)
]
diff --git a/align_data/greaterwrong/greaterwrong.py b/align_data/greaterwrong/greaterwrong.py
index a63ea140..0d45014f 100644
--- a/align_data/greaterwrong/greaterwrong.py
+++ b/align_data/greaterwrong/greaterwrong.py
@@ -6,199 +6,44 @@
import requests
import jsonlines
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup
from tqdm import tqdm
from markdownify import markdownify
-from align_data.common.alignment_dataset import AlignmentDataset , DataEntry
+from align_data.common.alignment_dataset import AlignmentDataset, DataEntry
logger = logging.getLogger(__name__)
-def extract_author(base_url, a):
- return {
- 'fullName': a.attrs.get('data-full-name'),
- 'userId': a.attrs.get('data-userid'),
- 'userLink': a.attrs.get('href') and base_url + a.attrs.get('href'),
- 'name': a.text,
- }
-
-
-def get_attr(elem: Tag, tag: str, selector, attr=None, processor=lambda x: x):
- """A generic extractor of HTML info, which will also handle the item not existing.
-
- :param Tag elem: the element to search in
- :param str tag: the HTML tag to look for, e.g. `div`. Can also be `None`, in which case any tag will work
- :param dict selector: additional selector to drill down on, e.g. `{'class': 'bla'}`
- :param str attr: the attribute of the element to extract, e.g. 'href'. Ignored if `None`
- :param fn processor: an optional transformer to be run on the extracted value for postprocessing
- """
- item = elem.find(tag, selector)
- value = item
- if attr and item:
- value = item and item.get(attr)
- return value and processor(value)
-
-
-def parse_karma(meta_div: Tag):
- """Extract the karma from the given element.
-
- :param Tag meta_div: the element to be processed - this is the div containing url, karma, authors etc.
- :returns: a `(score, karma)` tuple, where `score` is the overall karma, while `karma` is a dict of per site karma
- """
- site = get_attr(meta_div, 'a', {'class': 'lw2-link'}, processor=lambda a: a.get('title') or next(a.children))
- karma_text = get_attr(meta_div, 'span', {'class': 'karma-value'}, processor=lambda d: d.text.strip())
- if not karma_text:
- score, karma = None, {}
- # In the case of this post only being on one server, the karma is provided as a string like "123 points"
- elif 'point' in karma_text:
- score = int(karma_text.split()[0].replace('−', '-'))
- karma = {site: score}
- # When it's e.g. an alignment forum post, it will have site specific karma, like "LW: 123, AF: 432"
- elif karma_text:
- parts = karma_text.replace(':', '').split()
- karma = {k: int(v) for k, v in zip(parts[::2], parts[1::2])}
- score = list(karma.values())[0]
- else:
- score, karma = None, {}
- return score, karma
-
-
-def extract_metadata(base_url: str, post: Tag, meta_div=None):
- """Extract the metadata of the post/comment.
-
- :param str base_url: the base url of the forum being used, e.g. 'https://lesswrong.com'
- :param Tag post: the HTML element to process
- :param Tag meta_div: used if the metadata is in multiple tags. Will use `post` if `None`
-
- :returns: a dict of extracted metadata values. Values that are empty will be removed
- """
- meta_div = meta_div or post
- score, karma = parse_karma(meta_div)
-
- metadata = {
- 'title': next(post.children).text,
- 'url': get_attr(meta_div, 'a', {'class': 'lw2-link'}, 'href'),
- 'post_url': get_attr(post, 'a', {'class': 'post-title-link'}, 'href', lambda url: base_url + url),
- 'link_post': get_attr(post, 'a', {'class': 'link-post-link'}, 'href'),
- 'authors': [extract_author(base_url, a) for a in meta_div.findChildren('a', {'class': 'author'})],
- 'date_published': get_attr(
- meta_div, None, {'class': 'date'},
- processor=lambda d: datetime.datetime.strptime(d.text.strip(), '%d %b %Y %H:%M %Z').isoformat()
- ),
- 'votes': get_attr(meta_div, 'span', {'class': 'karma-value'}, 'title', lambda v: int(v.split()[0])),
- 'score': score,
- 'karma': karma,
- 'tags': get_attr(meta_div, 'div', {'id': 'tags'}, processor=lambda d: [a.text.strip() for a in d.find_all('a')]),
- 'words': get_attr(meta_div, 'span', {'class': 'read-time'}, 'title'), #meta_div.find('span', {'class': 'read-time'}).attrs.get('title'),
- }
- return {k: v for k, v in metadata.items() if v}
-
-
-def fetch_month_urls(base_url: str, year: int, month: int, delay=1):
- """Fetch all posts from the given `year` and `month` from `base_url`.
-
- This will automatically paginate through all available pages.
- GreaterWrong has a limit of 2000 entries per pagination, which is why this is done per month
-
- To avoid clobbering the service, `delay` seconds will be waited between each network call.
-
- :returns: a list of metadata dicts for each post
- """
- all_posts = []
-
- url = f'/archive/{year}/{month}'
- while url:
- logger.debug('Fetching items for %s', url)
- res = requests.get(base_url + url)
- soup = BeautifulSoup(res.text, "html.parser")
-
- posts = soup.find_all('h1', {'class': 'listing'})
- all_posts += [extract_metadata(base_url, post, post.find_next_sibling('div')) for post in posts]
-
- url = soup.find('a', {'class': 'nav-item-next'})
- url = url and url.attrs.get('href').replace('#', '')
-
- time.sleep(delay)
-
- logger.debug('Found %s posts for %s/%s', len(all_posts), year, month)
- return all_posts
-
-
-def fetch_all_urls(base_url: str, urls_data_path: Path, start_year: int, delay=1):
- """Fetch the metadata of all posts from `base_url`, starting from `start_year`.
-
- This will create a separate data file for each month, starting from the earliest one checked. The resulting
- files will contain a JSON object per line containing the metadata of each post. If there were no posts in a
- give month (this happened in the beginning of LW, for example), then an empty file will be created to mark
- that month as checked. The latest month will always be rechecked, as it's most likely not up to date.
- """
- # Any url file that was created contains all urls for that month and so can be skipped. This
- # assumption only holds if post publication dates cannot be changed, and if posts won't retroactively
- # appear - both of which seem reasonable. Ignore the latest file though, as it probably won't contain
- # all urls for that given month
- known_urls = sorted(urls_data_path.glob('*'))[:-1]
-
- now = datetime.date.today()
- # Construct a big list of all months, rather than having nested loops, coz then
- # tqdm can show a nice loading bar
- dates = [
- (year, month)
- for year in range(start_year, now.year + 1)
- for month in range(1, 13)
- ]
- for year, month in tqdm(dates):
- data_file = urls_data_path / f'{year}_{month}.jsonl'
-
- if data_file in known_urls:
- logger.debug(f'Already processed {data_file.name} - skipping')
- continue
-
- try:
- posts = fetch_month_urls(base_url, year, month, delay)
- except Exception as e:
- logger.error(e)
- else:
- with jsonlines.open(data_file , mode='w') as writer:
- writer.write_all(posts)
-
- # No point in looking for future posts...
- if year == now.year and month == now.month:
- break
-
-
-def parse_comments(base_url: str, elem: Tag):
- """Recursively extract the whole comment tree from the given HTML `elem`."""
- if not elem or not elem.get('class'):
- return None
- if 'comment-thread' in elem.get('class') or 'comments' in elem.get('class'):
- return list(filter(None, map(lambda x: parse_comments(base_url, x), elem.children)))
- if 'comment-item' in elem.get('class'):
- comment = elem.find('div', {'class': 'comment'})
- if 'deleted-comment' in comment.get('class'):
- return None
+def fetch_LW_tags(url):
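+    """Fetch the names of all AI related tags from LessWrong's /tag/ai page.
+
+    This scrapes the tags table from the page's description, so it assumes that the page keeps its current layout.
+    """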
+ res = requests.get(
+ url + '/tag/ai',
+        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0'},
+ )
+ soup = BeautifulSoup(res.content, "html.parser")
+ container = soup.find('div', {'class': 'TagPage-description'}).find('table')
+    return {a.text.strip() for a in container.find_all('a') if '/tag/' in a.get('href', '')}
- metadata = extract_metadata(base_url, comment)
- return {
- 'text': comment.find('div', {'class': 'body-text'}).text,
- 'votes': metadata.get('votes'),
- 'score': metadata.get('score'),
- 'karma': metadata.get('karma'),
- 'url': metadata.get('url'),
- 'date_published': metadata['date_published'],
- 'author': metadata.get('authors', [{}])[0].get('name'),
- 'comments': parse_comments(base_url, elem.find('ul', {'class': 'comment-thread'})),
- }
+def fetch_ea_forum_topics(url):
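+    """Fetch the names of all AI safety related topics from the EA forum's /topics/ai-safety sidebar."""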
+ res = requests.get(url + '/topics/ai-safety')
+ soup = BeautifulSoup(res.content, "html.parser")
+ container = soup.find('div', {'class': 'SidebarSubtagsBox-root'})
+ return {a.text.strip() for a in container.find_all('a') if '/topics/' in a.get('href', '')}
- return None
+def get_allowed_tags(url, name):
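+    """Get the set of tags/topics that mark a post as relevant for the given datasource.
+
+    An empty set means that all posts are accepted - this is how the alignmentforum is handled, as everything there is on topic.
+    """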
+ if name == 'alignmentforum':
+ return set()
+ try:
+ if name == 'lesswrong':
+ return fetch_LW_tags(url)
+ if name == 'eaforum':
+ return fetch_ea_forum_topics(url)
+    except Exception as e:
+        raise ValueError('Could not fetch tags! Please retry') from e
-def fetch_ai_tags(url):
- res = requests.get(url + '/tag/ai')
- soup = BeautifulSoup(res.content, "html.parser")
- container = soup.find('div', {'class': 'tag-description'}).find('table')
- return [a.text.strip() for a in container.find_all('a') if a.get('href').startswith('/tag/')]
+ raise ValueError(f'Could not fetch tags for unknown datasource: "{name}". Must be one of alignmentforum|lesswrong|eaforum')
@dataclass
@@ -212,79 +57,118 @@ class GreaterWrong(AlignmentDataset):
base_url: str
start_year: int
min_karma: int
+ """Posts must have at least this much karma to be returned."""
+ af: bool
+ """Whether alignment forum posts should be returned"""
+ limit = 50
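+    """The maximum number of posts to fetch per GraphQL request."""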
COOLDOWN_TIME : float = 0.5
done_key = "url"
+ lazy_eval = True
def setup(self):
super().setup()
logger.info(f"Grabbing most recent links (grabs all links if /{self.name}/urls/ is empty)...")
self.skipped_urls = self.raw_data_path / self.name / 'skipped'
- self.files_path = self.raw_data_path / self.name / 'urls'
- self.files_path.mkdir(parents=True, exist_ok=True)
- fetch_all_urls(self.base_url, self.files_path, self.start_year, self.COOLDOWN)
logger.debug("Fetching ai tags...")
- try:
- self.ai_tags = set(fetch_ai_tags(self.base_url))
- except Exception:
- raise ValueError('Could not fetch tags! Please retry')
-
- @property
- def items_list(self):
- logger.debug("Converting each link to a json with post & comments...")
- if self.skipped_urls.exists():
- with open(self.skipped_urls) as f:
- skipped = {l.strip() for l in f}
- else:
- skipped = []
+ self.ai_tags = get_allowed_tags(self.base_url, self.name)
- links = []
- for filename in self.files_path.glob('*'):
- with jsonlines.open(filename) as reader:
- links += [
- item for item in reader
- if item.get('post_url') and item.get('score', 0) >= self.min_karma and item['post_url'] not in skipped
- ]
- return links
+ def tags_ok(self, post):
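+        """Check whether the post has at least one of the allowed tags - an empty allow-list accepts everything."""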
+ return not self.ai_tags or {t['name'] for t in post['tags']} & self.ai_tags
def get_item_key(self, item):
- return item['url']
-
- def process_entry(self, item):
- # Skip this if the request failed. The idea being that the next scrape will pick it up
- post_url = item['post_url']
- try:
- res = requests.get(post_url)
- except requests.ConnectTimeout:
- logger.error('Timeout while fetching %s - skipping for now', post_url)
- return None
+ return item['pageUrl']
+
+ def make_query(self, after):
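+        """Construct a GraphQL query for all posts published after the given `after` date.
+
+        `view: "old"` returns posts sorted oldest first, which is what the `postedAt` based pagination in `items_list` relies on.
+        """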
+ return """{
+ posts(input: {
+ terms: {
+ excludeEvents: true
+ view: "old"
+ """ \
+        f"          af: {str(self.af).lower()}\n" \
+ f" limit: {self.limit}\n" \
+ f" karmaThreshold: {self.min_karma}\n" \
+ f' after: "{after}"\n' \
+ """ filter: "tagged"
+ }
+ }) {
+ totalCount
+ results {
+ _id
+ title
+ slug
+ pageUrl
+ postedAt
+ modifiedAt
+ score
+ extendedScore
+ baseScore
+ voteCount
+ commentCount
+ wordCount
+ tags {
+ name
+ }
+ user {
+ username
+ displayName
+ }
+ coauthors {
+ username
+ displayName
+ }
+ af
+ htmlBody
+ }
+ }
+ }"""
+
+ def fetch_posts(self, query):
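+        """POST the given GraphQL `query` to the forum's /graphql endpoint and return the resulting `posts` object."""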
+ res = requests.post(
+ f'{self.base_url}/graphql',
+ # The GraphQL endpoint returns a 403 if the user agent isn't set... Makes sense, but is annoying
+            headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0'},
+ json={'query': query}
+ )
+ return res.json()['data']['posts']
- if res.status_code != 200:
- logger.error('Got status code of %s while fetching %s - skipping for now', res.status_code, post_url)
- return None
+ @property
+ def items_list(self):
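+        """Lazily yield all relevant posts, paginating by each batch's last `postedAt` value.
+
+        If a previous run already wrote entries to the output file, resume from the date of its last entry.
+        """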
+ next_date = datetime.datetime(self.start_year, 1, 1).isoformat() + 'Z'
+ if self.jsonl_path.exists() and self.jsonl_path.lstat().st_size:
+ with jsonlines.open(self.jsonl_path) as f:
+ for item in f:
+ pass
+ next_date = item['date_published']
- html = res.text.replace("\u201c", '"').replace("\u201d", '"')
- soup = BeautifulSoup(html, "html.parser")
+ while next_date:
+ posts = self.fetch_posts(self.make_query(next_date))
- post = soup.find('main', {'class': 'post'})
+ for post in posts['results']:
+ if post['htmlBody'] and self.tags_ok(post):
+ yield post
- title = post.find('h1')
- meta_div = title.find_next_sibling('div')
- metadata = extract_metadata(self.base_url, title, meta_div)
+            if not posts['results']:
+                return
+            next_date = posts['results'][-1]['postedAt']
+            if len(posts['results']) < self.limit:
+                return
- # Skip this item if it doesn't have at least one AI tag
- if not self.ai_tags & set(metadata.get('tags', [])):
- with open(self.skipped_urls, 'a') as f:
- f.write(post_url + '\n')
- return None
+ time.sleep(self.COOLDOWN)
- return DataEntry(
- item,
- text=markdownify(post.find('div', {'class': 'body-text'}).renderContents()),
- comments=parse_comments(self.base_url, soup.find('div', {'id': 'comments'})),
- source=self.name,
- source_type='greaterwrong',
- **metadata
- )
+ def process_entry(self, item):
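+        """Map a raw GraphQL post to a DataEntry."""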
+ return DataEntry({
+ 'title': item['title'],
+ 'url': item['pageUrl'],
+ 'date_published': item['postedAt'],
+ 'modifiedAt': item['modifiedAt'],
+ 'text': markdownify(item['htmlBody']),
+ "source": self.name,
+ "source_type": "GreaterWrong",
+ 'votes': item['voteCount'],
+ 'karma': item['baseScore'],
+ 'tags': [t['name'] for t in item['tags']],
+ 'words': item['wordCount'],
+            'authors': [item['user']] + (item['coauthors'] or []),
+ })
diff --git a/tests/align_data/test_greaterwrong.py b/tests/align_data/test_greaterwrong.py
deleted file mode 100644
index c6fac7c6..00000000
--- a/tests/align_data/test_greaterwrong.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import json
-import pytest
-import jsonlines
-from pathlib import Path
-from bs4 import BeautifulSoup
-
-from align_data.common.alignment_dataset import DataEntry
-from align_data.greaterwrong.greaterwrong import (
- GreaterWrong, extract_author, get_attr, parse_karma, extract_metadata, fetch_month_urls, fetch_all_urls, parse_comments
-)
-
-
-def test_extract_author_with_valid_data():
- base_url = 'http://example.com'
-    html = '<a data-full-name="John Doe" data-userid="12345" href="/user/12345">John Doe</a>'
- soup = BeautifulSoup(html, 'html.parser')
- a = soup.find('a')
-
- expected_result = {
- 'fullName': 'John Doe',
- 'userId': '12345',
- 'userLink': 'http://example.com/user/12345',
- 'name': 'John Doe',
- }
- assert extract_author(base_url, a) == expected_result
-
-
-def test_extract_author_with_missing_data():
- base_url = 'http://example.com'
-    html = '<a href="/user/12345">John Doe</a>'
- soup = BeautifulSoup(html, 'html.parser')
- a = soup.find('a')
-
- expected_result = {
- 'fullName': None,
- 'userId': None,
- 'userLink': 'http://example.com/user/12345',
- 'name': 'John Doe',
- }
- assert extract_author(base_url, a) == expected_result
-
-
-def test_extract_author_with_invalid_data():
- base_url = 'http://example.com'
-    html = '<a>John Doe</a>'
- soup = BeautifulSoup(html, 'html.parser')
- a = soup.find('a')
-
- expected_result = {
- 'fullName': None,
- 'userId': None,
- 'userLink': None,
- 'name': 'John Doe',
- }
- assert extract_author(base_url, a) == expected_result
-
-
-def test_extract_author_with_missing_base_url():
- base_url = None
-    html = '<a data-full-name="John Doe" data-userid="12345" href="/user/12345">John Doe</a>'
- soup = BeautifulSoup(html, 'html.parser')
- a = soup.find('a')
-
- expected_result = {
- 'fullName': 'John Doe',
- 'userId': '12345',
- 'userLink': None,
- 'name': 'John Doe',
- }
- with pytest.raises(TypeError):
- extract_author(base_url, a)
-
-
-@pytest.mark.parametrize('div, selector, attr, expected', (
- # Test basic functionality
- ('div', {'class': 'inner'}, 'id', 'target'),
- ('div', {'class': 'inner'}, 'data-value', '123'),
-
- # missing tag selects anything
- (None, {'class': 'inner'}, 'id', 'target'),
- (None, {'class': 'non-existent'}, 'id', None),
-
- # Test missing attribute
- ('div', {'class': 'inner'}, 'non-existent', None),
- ('div', {'class': 'non-existent'}, 'id', None),
-))
-def test_get_attr(div, selector, attr, expected):
-    html = '''
-    <div class="outer">
-        <div class="inner" id="target" data-value="123">some text</div>
-    </div>
-    '''
- soup = BeautifulSoup(html, 'html.parser')
- outer = soup.find('div', {'class': 'outer'})
-
- assert get_attr(outer, div, selector, attr) == expected
-
-
-@pytest.mark.parametrize('attr, processor, expected', (
- (None, lambda v: v.text.upper(), 'SOME TEXT'),
- (None, lambda v: 'ble ble ble', 'ble ble ble'),
-
- ('data-value', int, 123),
- ('class', lambda v: v + ['bla bla'], ['inner', 'bla bla']),
-))
-def test_get_attr_post_processing(attr, processor, expected):
-    html = '''
-    <div class="outer">
-        <div class="inner" id="target" data-value="123">some text</div>
-    </div>
-    '''
- soup = BeautifulSoup(html, 'html.parser')
- outer = soup.find('div', {'class': 'outer'})
-
- assert get_attr(outer, 'div', {'class': 'inner'}, attr, processor) == expected
-
-
-@pytest.mark.parametrize('text, expected_karma, expected_score', (
- ('1 point', {'LW': 1}, 1),
- ('432 points', {'LW': 432}, 432),
-
- ('LW: 32 AF: 42 EA: 12', {'LW': 32, 'AF': 42, 'EA': 12}, 32),
- ('AF: 42 LW: 32 EA: 12', {'LW': 32, 'AF': 42, 'EA': 12}, 42),
-))
-def test_parse_karma(text, expected_karma, expected_score):
-    html = f'''
-    <div>
-        <a class="lw2-link" title="LW" href="/posts/123">link</a>
-        <span class="karma-value">{text}</span>
-    </div>
-    '''
- meta_div = BeautifulSoup(html, 'html.parser')
- score, karma = parse_karma(meta_div)
- assert score == expected_score
- assert karma == expected_karma
-
-
-@pytest.mark.parametrize('html', (
- '',
-    '''<div><span class="karma-value"></span></div>''',
-    '''<div><a class="lw2-link" title="LW">link</a></div>''',
-))
-def test_parse_karma_missing_counts(html):
- # Test case when Karma and site are None
- meta_div = BeautifulSoup(html, 'html.parser')
- score, karma = parse_karma(meta_div)
- assert score == None
- assert karma == {}
-
-
-def test_extract_metadata():
-    html = '''
-    <h1 class="listing">Outside the Laboratory</h1>
-    <div class="post-meta">
-        <a class="author" data-full-name="" data-userid="nmk3nLpQE89dMRzzN" href="/users/eliezer_yudkowsky">Eliezer Yudkowsky</a>
-        <span class="date">21 Jan 2007 03:46 UTC</span>
-        <span class="karma-value" title="108 votes">132 points</span>
-        <a class="lw2-link" title=" LW " href="https://www.lesswrong.com/posts/N2pENnTPB75sfc9kb/outside-the-laboratory">link</a>
-        <div id="tags">
-            <a>Law-Thinking</a>
-            <a>Rationality</a>
-            <a>Practice &amp; Philosophy of Science</a>
-            <a>Religion</a>
-            <a>Compartmentalization</a>
-        </div>
-    </div>
-    '''
- post = BeautifulSoup(html, 'html.parser').find('h1')
- assert extract_metadata('http://bla.bla', post, post.find_next_sibling('div')) == {
- 'title': 'Outside the Laboratory',
- 'url': 'https://www.lesswrong.com/posts/N2pENnTPB75sfc9kb/outside-the-laboratory',
- 'authors': [{
- 'fullName': '',
- 'userId': 'nmk3nLpQE89dMRzzN',
- 'userLink': 'http://bla.bla/users/eliezer_yudkowsky',
- 'name': 'Eliezer Yudkowsky'
- }],
- 'date_published': '2007-01-21T03:46:00',
- 'votes': 108,
- 'score': 132,
- 'karma': {' LW ': 132},
- 'tags': ['Law-Thinking', 'Rationality', 'Practice & Philosophy of Science', 'Religion', 'Compartmentalization']
- }
-
-
-def test_extract_metadata_remove_empty():
-    html = '<h1 class="listing">Outside the Laboratory</h1>'
- post = BeautifulSoup(html, 'html.parser').find('h1')
- assert extract_metadata('http://bla.bla', post) == {'title': 'Outside the Laboratory'}