diff --git a/align_data/common/alignment_dataset.py b/align_data/common/alignment_dataset.py index 5d0632b2..71f1a608 100644 --- a/align_data/common/alignment_dataset.py +++ b/align_data/common/alignment_dataset.py @@ -52,6 +52,9 @@ class AlignmentDataset: COOLDOWN = 0 """An optional cool down between processing entries""" + lazy_eval = False + """Whether to lazy fetch items. This is nice in that it will start processing, but messes up the progress bar.""" + # Internal housekeeping variables _entry_idx = 0 """Used internally for writing debugging info - each file write will increment it""" @@ -142,7 +145,13 @@ def unprocessed_items(self, items=None): def not_processed(item): return self.get_item_key(item) not in self._outputted_items - return tqdm(list(filter(not_processed, items or self.items_list))) + filtered = filter(not_processed, items or self.items_list) + + # greedily fetch all items if not lazy eval. This makes the progress bar look nice + if not self.lazy_eval: + filtered = list(filtered) + + return tqdm(filtered) def fetch_entries(self): """Get all entries to be written to the file.""" diff --git a/align_data/greaterwrong/__init__.py b/align_data/greaterwrong/__init__.py index 3bcc525f..8f4a079c 100644 --- a/align_data/greaterwrong/__init__.py +++ b/align_data/greaterwrong/__init__.py @@ -1,16 +1,25 @@ -from .greaterwrong import GreaterWrong +from .greaterwrong import GreaterWrong, fetch_ea_forum_topics, fetch_LW_tags GREATERWRONG_REGISTRY = [ GreaterWrong( name="lesswrong", - base_url='https://www.greaterwrong.com', + base_url='https://www.lesswrong.com', start_year=2005, min_karma=1, + af=False, + ), + GreaterWrong( + name="alignmentforum", + base_url='https://www.alignmentforum.org', + start_year=2009, + min_karma=1, + af=True, ), GreaterWrong( name="eaforum", - base_url='https://ea.greaterwrong.com', + base_url='https://forum.effectivealtruism.org', start_year=2011, min_karma=1, + af=False, ) ] diff --git a/align_data/greaterwrong/greaterwrong.py b/align_data/greaterwrong/greaterwrong.py index a63ea140..0d45014f 100644 --- a/align_data/greaterwrong/greaterwrong.py +++ b/align_data/greaterwrong/greaterwrong.py @@ -6,199 +6,44 @@ import requests import jsonlines -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup from tqdm import tqdm from markdownify import markdownify -from align_data.common.alignment_dataset import AlignmentDataset , DataEntry +from align_data.common.alignment_dataset import AlignmentDataset, DataEntry logger = logging.getLogger(__name__) -def extract_author(base_url, a): - return { - 'fullName': a.attrs.get('data-full-name'), - 'userId': a.attrs.get('data-userid'), - 'userLink': a.attrs.get('href') and base_url + a.attrs.get('href'), - 'name': a.text, - } - - -def get_attr(elem: Tag, tag: str, selector, attr=None, processor=lambda x: x): - """A generic extractor of HTML info, which will also handle the item not existing. - - :param Tag elem: the element to search in - :param str tag: the HTML tag to look for, e.g. `div`. Can also be `None`, in which case any tag will work - :param dict selector: additional selector to drill down on, e.g. `{'class': 'bla'}` - :param str attr: the attribute of the element to extract, e.g. 'href'. 
Ignored if `None` - :param fn processor: an optional transformer to be run on the extracted value for postprocessing - """ - item = elem.find(tag, selector) - value = item - if attr and item: - value = item and item.get(attr) - return value and processor(value) - - -def parse_karma(meta_div: Tag): - """Extract the karma from the given element. - - :param Tag meta_div: the element to be processed - this is the div containing url, karma, authors etc. - :returns: a `(score, karma)` tuple, where `score` is the overall karma, while `karma` is a dict of per site karma - """ - site = get_attr(meta_div, 'a', {'class': 'lw2-link'}, processor=lambda a: a.get('title') or next(a.children)) - karma_text = get_attr(meta_div, 'span', {'class': 'karma-value'}, processor=lambda d: d.text.strip()) - if not karma_text: - score, karma = None, {} - # In the case of this post only being on one server, the karma is provided as a string like "123 points" - elif 'point' in karma_text: - score = int(karma_text.split()[0].replace('−', '-')) - karma = {site: score} - # When it's e.g. an alignment forum post, it will have site specific karma, like "LW: 123, AF: 432" - elif karma_text: - parts = karma_text.replace(':', '').split() - karma = {k: int(v) for k, v in zip(parts[::2], parts[1::2])} - score = list(karma.values())[0] - else: - score, karma = None, {} - return score, karma - - -def extract_metadata(base_url: str, post: Tag, meta_div=None): - """Extract the metadata of the post/comment. - - :param str base_url: the base url of the forum being used, e.g. 'https://lesswrong.com' - :param Tag post: the HTML element to process - :param Tag meta_div: used if the metadata is in multiple tags. Will use `post` if `None` - - :returns: a dict of extracted metadata values. Values that are empty will be removed - """ - meta_div = meta_div or post - score, karma = parse_karma(meta_div) - - metadata = { - 'title': next(post.children).text, - 'url': get_attr(meta_div, 'a', {'class': 'lw2-link'}, 'href'), - 'post_url': get_attr(post, 'a', {'class': 'post-title-link'}, 'href', lambda url: base_url + url), - 'link_post': get_attr(post, 'a', {'class': 'link-post-link'}, 'href'), - 'authors': [extract_author(base_url, a) for a in meta_div.findChildren('a', {'class': 'author'})], - 'date_published': get_attr( - meta_div, None, {'class': 'date'}, - processor=lambda d: datetime.datetime.strptime(d.text.strip(), '%d %b %Y %H:%M %Z').isoformat() - ), - 'votes': get_attr(meta_div, 'span', {'class': 'karma-value'}, 'title', lambda v: int(v.split()[0])), - 'score': score, - 'karma': karma, - 'tags': get_attr(meta_div, 'div', {'id': 'tags'}, processor=lambda d: [a.text.strip() for a in d.find_all('a')]), - 'words': get_attr(meta_div, 'span', {'class': 'read-time'}, 'title'), #meta_div.find('span', {'class': 'read-time'}).attrs.get('title'), - } - return {k: v for k, v in metadata.items() if v} - - -def fetch_month_urls(base_url: str, year: int, month: int, delay=1): - """Fetch all posts from the given `year` and `month` from `base_url`. - - This will automatically paginate through all available pages. - GreaterWrong has a limit of 2000 entries per pagination, which is why this is done per month - - To avoid clobbering the service, `delay` seconds will be waited between each network call. 
- - :returns: a list of metadata dicts for each post - """ - all_posts = [] - - url = f'/archive/{year}/{month}' - while url: - logger.debug('Fetching items for %s', url) - res = requests.get(base_url + url) - soup = BeautifulSoup(res.text, "html.parser") - - posts = soup.find_all('h1', {'class': 'listing'}) - all_posts += [extract_metadata(base_url, post, post.find_next_sibling('div')) for post in posts] - - url = soup.find('a', {'class': 'nav-item-next'}) - url = url and url.attrs.get('href').replace('#', '') - - time.sleep(delay) - - logger.debug('Found %s posts for %s/%s', len(all_posts), year, month) - return all_posts - - -def fetch_all_urls(base_url: str, urls_data_path: Path, start_year: int, delay=1): - """Fetch the metadata of all posts from `base_url`, starting from `start_year`. - - This will create a separate data file for each month, starting from the earliest one checked. The resulting - files will contain a JSON object per line containing the metadata of each post. If there were no posts in a - give month (this happened in the beginning of LW, for example), then an empty file will be created to mark - that month as checked. The latest month will always be rechecked, as it's most likely not up to date. - """ - # Any url file that was created contains all urls for that month and so can be skipped. This - # assumption only holds if post publication dates cannot be changed, and if posts won't retroactively - # appear - both of which seem reasonable. Ignore the latest file though, as it probably won't contain - # all urls for that given month - known_urls = sorted(urls_data_path.glob('*'))[:-1] - - now = datetime.date.today() - # Construct a big list of all months, rather than having nested loops, coz then - # tqdm can show a nice loading bar - dates = [ - (year, month) - for year in range(start_year, now.year + 1) - for month in range(1, 13) - ] - for year, month in tqdm(dates): - data_file = urls_data_path / f'{year}_{month}.jsonl' - - if data_file in known_urls: - logger.debug(f'Already processed {data_file.name} - skipping') - continue - - try: - posts = fetch_month_urls(base_url, year, month, delay) - except Exception as e: - logger.error(e) - else: - with jsonlines.open(data_file , mode='w') as writer: - writer.write_all(posts) - - # No point in looking for future posts... 
- if year == now.year and month == now.month: - break - - -def parse_comments(base_url: str, elem: Tag): - """Recursively extract the whole comment tree from the given HTML `elem`.""" - if not elem or not elem.get('class'): - return None - if 'comment-thread' in elem.get('class') or 'comments' in elem.get('class'): - return list(filter(None, map(lambda x: parse_comments(base_url, x), elem.children))) - if 'comment-item' in elem.get('class'): - comment = elem.find('div', {'class': 'comment'}) - if 'deleted-comment' in comment.get('class'): - return None +def fetch_LW_tags(url): + res = requests.get( + url + '/tag/ai', + headers={'User-Agent': 'Mozilla /5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0'}, + ) + soup = BeautifulSoup(res.content, "html.parser") + container = soup.find('div', {'class': 'TagPage-description'}).find('table') + return {a.text.strip() for a in container.find_all('a') if '/tag/' in a.get('href')} - metadata = extract_metadata(base_url, comment) - return { - 'text': comment.find('div', {'class': 'body-text'}).text, - 'votes': metadata.get('votes'), - 'score': metadata.get('score'), - 'karma': metadata.get('karma'), - 'url': metadata.get('url'), - 'date_published': metadata['date_published'], - 'author': metadata.get('authors', [{}])[0].get('name'), - 'comments': parse_comments(base_url, elem.find('ul', {'class': 'comment-thread'})), - } +def fetch_ea_forum_topics(url): + res = requests.get(url + '/topics/ai-safety') + soup = BeautifulSoup(res.content, "html.parser") + container = soup.find('div', {'class': 'SidebarSubtagsBox-root'}) + return {a.text.strip() for a in container.find_all('a') if '/topics/' in a.get('href', '')} - return None +def get_allowed_tags(url, name): + if name == 'alignmentforum': + return set() + try: + if name == 'lesswrong': + return fetch_LW_tags(url) + if name == 'eaforum': + return fetch_ea_forum_topics(url) + except Exception: + raise ValueError('Could not fetch tags! Please retry') -def fetch_ai_tags(url): - res = requests.get(url + '/tag/ai') - soup = BeautifulSoup(res.content, "html.parser") - container = soup.find('div', {'class': 'tag-description'}).find('table') - return [a.text.strip() for a in container.find_all('a') if a.get('href').startswith('/tag/')] + raise ValueError(f'Could not fetch tags for unknown datasource: "{name}". Must be one of alignmentforum|lesswrong|eaforum') @dataclass @@ -212,79 +57,118 @@ class GreaterWrong(AlignmentDataset): base_url: str start_year: int min_karma: int + """Posts must have at least this much karma to be returned.""" + af: bool + """Whether alignment forum posts should be returned""" + limit = 50 COOLDOWN_TIME : float = 0.5 done_key = "url" + lazy_eval = True def setup(self): super().setup() logger.info(f"Grabbing most recent links (grabs all links if /{self.name}/urls/ is empty)...") self.skipped_urls = self.raw_data_path / self.name / 'skipped' - self.files_path = self.raw_data_path / self.name / 'urls' - self.files_path.mkdir(parents=True, exist_ok=True) - fetch_all_urls(self.base_url, self.files_path, self.start_year, self.COOLDOWN) logger.debug("Fetching ai tags...") - try: - self.ai_tags = set(fetch_ai_tags(self.base_url)) - except Exception: - raise ValueError('Could not fetch tags! 
Please retry') - - @property - def items_list(self): - logger.debug("Converting each link to a json with post & comments...") - if self.skipped_urls.exists(): - with open(self.skipped_urls) as f: - skipped = {l.strip() for l in f} - else: - skipped = [] + self.ai_tags = get_allowed_tags(self.base_url, self.name) - links = [] - for filename in self.files_path.glob('*'): - with jsonlines.open(filename) as reader: - links += [ - item for item in reader - if item.get('post_url') and item.get('score', 0) >= self.min_karma and item['post_url'] not in skipped - ] - return links + def tags_ok(self, post): + return not self.ai_tags or {t['name'] for t in post['tags']} & self.ai_tags def get_item_key(self, item): - return item['url'] - - def process_entry(self, item): - # Skip this if the request failed. The idea being that the next scrape will pick it up - post_url = item['post_url'] - try: - res = requests.get(post_url) - except requests.ConnectTimeout: - logger.error('Timeout while fetching %s - skipping for now', post_url) - return None + return item['pageUrl'] + + def make_query(self, after): + return """{ + posts(input: { + terms: { + excludeEvents: true + view: "old" + """ \ + f" af: {self.af}\n" \ + f" limit: {self.limit}\n" \ + f" karmaThreshold: {self.min_karma}\n" \ + f' after: "{after}"\n' \ + """ filter: "tagged" + } + }) { + totalCount + results { + _id + title + slug + pageUrl + postedAt + modifiedAt + score + extendedScore + baseScore + voteCount + commentCount + wordCount + tags { + name + } + user { + username + displayName + } + coauthors { + username + displayName + } + af + htmlBody + } + } + }""" + + def fetch_posts(self, query): + res = requests.post( + f'{self.base_url}/graphql', + # The GraphQL endpoint returns a 403 if the user agent isn't set... 
Makes sense, but is annoying + headers={'User-Agent': 'Mozilla /5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0'}, + json={'query': query} + ) + return res.json()['data']['posts'] - if res.status_code != 200: - logger.error('Got status code of %s while fetching %s - skipping for now', res.status_code, post_url) - return None + @property + def items_list(self): + next_date = datetime.datetime(self.start_year, 1, 1).isoformat() + 'Z' + if self.jsonl_path.exists() and self.jsonl_path.lstat().st_size: + with jsonlines.open(self.jsonl_path) as f: + for item in f: + pass + next_date = item['date_published'] - html = res.text.replace("\u201c", '"').replace("\u201d", '"') - soup = BeautifulSoup(html, "html.parser") + while next_date: + posts = self.fetch_posts(self.make_query(next_date)) - post = soup.find('main', {'class': 'post'}) + for post in posts['results']: + if post['htmlBody'] and self.tags_ok(post): + yield post - title = post.find('h1') - meta_div = title.find_next_sibling('div') - metadata = extract_metadata(self.base_url, title, meta_div) + next_date = posts['results'][-1]['postedAt'] + if len(posts['results']) < 50: + return - # Skip this item if it doesn't have at least one AI tag - if not self.ai_tags & set(metadata.get('tags', [])): - with open(self.skipped_urls, 'a') as f: - f.write(post_url + '\n') - return None + time.sleep(self.COOLDOWN) - return DataEntry( - item, - text=markdownify(post.find('div', {'class': 'body-text'}).renderContents()), - comments=parse_comments(self.base_url, soup.find('div', {'id': 'comments'})), - source=self.name, - source_type='greaterwrong', - **metadata - ) + def process_entry(self, item): + return DataEntry({ + 'title': item['title'], + 'url': item['pageUrl'], + 'date_published': item['postedAt'], + 'modifiedAt': item['modifiedAt'], + 'text': markdownify(item['htmlBody']), + "source": self.name, + "source_type": "GreaterWrong", + 'votes': item['voteCount'], + 'karma': item['baseScore'], + 'tags': [t['name'] for t in item['tags']], + 'words': item['wordCount'], + 'authors': [item['user']] + item['coauthors'], + }) diff --git a/tests/align_data/test_greaterwrong.py b/tests/align_data/test_greaterwrong.py deleted file mode 100644 index c6fac7c6..00000000 --- a/tests/align_data/test_greaterwrong.py +++ /dev/null @@ -1,212 +0,0 @@ -import json -import pytest -import jsonlines -from pathlib import Path -from bs4 import BeautifulSoup - -from align_data.common.alignment_dataset import DataEntry -from align_data.greaterwrong.greaterwrong import ( - GreaterWrong, extract_author, get_attr, parse_karma, extract_metadata, fetch_month_urls, fetch_all_urls, parse_comments -) - - -def test_extract_author_with_valid_data(): - base_url = 'http://example.com' - html = 'John Doe' - soup = BeautifulSoup(html, 'html.parser') - a = soup.find('a') - - expected_result = { - 'fullName': 'John Doe', - 'userId': '12345', - 'userLink': 'http://example.com/user/12345', - 'name': 'John Doe', - } - assert extract_author(base_url, a) == expected_result - - -def test_extract_author_with_missing_data(): - base_url = 'http://example.com' - html = 'John Doe' - soup = BeautifulSoup(html, 'html.parser') - a = soup.find('a') - - expected_result = { - 'fullName': None, - 'userId': None, - 'userLink': 'http://example.com/user/12345', - 'name': 'John Doe', - } - assert extract_author(base_url, a) == expected_result - - -def test_extract_author_with_invalid_data(): - base_url = 'http://example.com' - html = 'John Doe' - soup = BeautifulSoup(html, 'html.parser') 
- a = soup.find('a') - - expected_result = { - 'fullName': None, - 'userId': None, - 'userLink': None, - 'name': 'John Doe', - } - assert extract_author(base_url, a) == expected_result - - -def test_extract_author_with_missing_base_url(): - base_url = None - html = 'John Doe' - soup = BeautifulSoup(html, 'html.parser') - a = soup.find('a') - - expected_result = { - 'fullName': 'John Doe', - 'userId': '12345', - 'userLink': None, - 'name': 'John Doe', - } - with pytest.raises(TypeError): - extract_author(base_url, a) - - -@pytest.mark.parametrize('div, selector, attr, expected', ( - # Test basic functionality - ('div', {'class': 'inner'}, 'id', 'target'), - ('div', {'class': 'inner'}, 'data-value', '123'), - - # missing tag selects anything - (None, {'class': 'inner'}, 'id', 'target'), - (None, {'class': 'non-existent'}, 'id', None), - - # Test missing attribute - ('div', {'class': 'inner'}, 'non-existent', None), - ('div', {'class': 'non-existent'}, 'id', None), -)) -def test_get_attr(div, selector, attr, expected): - html = ''' -
-    <div class="outer">
-        <div class="inner" id="target" data-value="123">Some text</div>
-    </div>
- ''' - soup = BeautifulSoup(html, 'html.parser') - outer = soup.find('div', {'class': 'outer'}) - - assert get_attr(outer, div, selector, attr) == expected - - -@pytest.mark.parametrize('attr, processor, expected', ( - (None, lambda v: v.text.upper(), 'SOME TEXT'), - (None, lambda v: 'ble ble ble', 'ble ble ble'), - - ('data-value', int, 123), - ('class', lambda v: v + ['bla bla'], ['inner', 'bla bla']), -)) -def test_get_attr_post_processing(attr, processor, expected): - html = ''' -
-    <div class="outer">
-        <div class="inner" id="target" data-value="123">Some text</div>
-    </div>
- ''' - soup = BeautifulSoup(html, 'html.parser') - outer = soup.find('div', {'class': 'outer'}) - - assert get_attr(outer, 'div', {'class': 'inner'}, attr, processor) == expected - - -@pytest.mark.parametrize('text, expected_karma, expected_score', ( - ('1 point', {'LW': 1}, 1), - ('432 points', {'LW': 432}, 432), - - ('LW: 32 AF: 42 EA: 12', {'LW': 32, 'AF': 42, 'EA': 12}, 32), - ('AF: 42 LW: 32 EA: 12', {'LW': 32, 'AF': 42, 'EA': 12}, 42), -)) -def test_parse_karma(text, expected_karma, expected_score): - html = f'''
-    <div>
-        <a class="lw2-link" title="LW">Less Wrong</a>
-        <span class="karma-value">{text}</span>
-    </div>
- ''' - meta_div = BeautifulSoup(html, 'html.parser') - score, karma = parse_karma(meta_div) - assert score == expected_score - assert karma == expected_karma - - -@pytest.mark.parametrize('html', ( - '
', - '''
- Less Wrong -
''', - '''
- Less Wrong - -
''', -)) -def test_parse_karma_missing_counts(html): - # Test case when Karma and site are None - meta_div = BeautifulSoup(html, 'html.parser') - score, karma = parse_karma(meta_div) - assert score == None - assert karma == {} - - -def test_extract_metadata(): - html = ''' -
-

Outside the Laboratory

-
- Eliezer Yudkowsky - 21 Jan 2007 3:46 UTC -
- - 132 points - -
351 comments - LW link - Archive - - - -
-
- ''' - post = BeautifulSoup(html, 'html.parser').find('h1') - assert extract_metadata('http://bla.bla', post, post.find_next_sibling('div')) == { - 'title': 'Outside the Laboratory', - 'url': 'https://www.lesswrong.com/posts/N2pENnTPB75sfc9kb/outside-the-laboratory', - 'authors': [{ - 'fullName': '', - 'userId': 'nmk3nLpQE89dMRzzN', - 'userLink': 'http://bla.bla/users/eliezer_yudkowsky', - 'name': 'Eliezer Yudkowsky' - }], - 'date_published': '2007-01-21T03:46:00', - 'votes': 108, - 'score': 132, - 'karma': {' LW ': 132}, - 'tags': ['Law-Thinking', 'Rationality', 'Practice & Philosophy of Science', 'Religion', 'Compartmentalization'] - } - - -def test_extract_metadata_remove_empty(): - html = '

Outside the Laboratory

' - post = BeautifulSoup(html, 'html.parser').find('h1') - assert extract_metadata('http://bla.bla', post) == {'title': 'Outside the Laboratory'}
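Note on the `lazy_eval` flag introduced in `AlignmentDataset` above: when a subclass sets it (as `GreaterWrong` now does), `unprocessed_items` hands the filtered generator straight to `tqdm`, so processing can start before all items are known, at the cost of a progress bar with no total; the default path still materialises the list first. A minimal sketch of the behaviour, with `items` and `is_new` standing in for `self.items_list` and the `done_key` check:

    from tqdm import tqdm

    def unprocessed(items, is_new, lazy_eval=False):
        filtered = filter(is_new, items)
        if not lazy_eval:
            # Greedy path: materialising the list gives tqdm a total to display.
            filtered = list(filtered)
        return tqdm(filtered)

    # With lazy_eval=True nothing is evaluated until the result is iterated;
    # with the default, filter() is exhausted right here, so the bar knows its length.
    # unprocessed(range(10), lambda i: i % 2 == 0, lazy_eval=True)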
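The tag allowlist works as follows: `get_allowed_tags` pulls the set of AI-related tag or topic names from the forum itself (and returns an empty set for the alignment forum, presumably because everything there is in scope), and `tags_ok` keeps a post when no allowlist applies or when the post shares at least one tag name with it. A rough standalone equivalent (the sample data is made up):

    def tags_ok(post, allowed_tags):
        # Truthy when there is no allowlist, or when the intersection is non-empty.
        return not allowed_tags or {t['name'] for t in post['tags']} & allowed_tags

    tags_ok({'tags': [{'name': 'AI'}, {'name': 'Rationality'}]}, {'AI', 'AI Risk'})  # -> {'AI'}
    tags_ok({'tags': [{'name': 'Parenting'}]}, {'AI', 'AI Risk'})                    # -> set(), falsy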
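For context on the new fetch path: instead of scraping greaterwrong.com archive pages month by month, `items_list` now pages through the forum's GraphQL endpoint, resuming from the last `date_published` already written to the jsonl file and advancing the `after` cursor to the `postedAt` of the last post in each batch. A self-contained sketch of that loop; the endpoint path, query shape and the User-Agent requirement are taken from the diff, while the helper names, base URL and generic User-Agent string here are illustrative:

    import time
    import requests

    BASE_URL = 'https://www.lesswrong.com'   # any of the registry base_urls would do
    HEADERS = {'User-Agent': 'Mozilla/5.0'}  # the endpoint returns a 403 without a user agent

    def fetch_page(after, limit=50, min_karma=1, af=False):
        # Same shape as GreaterWrong.make_query, trimmed to a few result fields.
        # GraphQL expects lowercase booleans, hence str(af).lower().
        query = f'''{{
          posts(input: {{terms: {{
            excludeEvents: true
            view: "old"
            af: {str(af).lower()}
            limit: {limit}
            karmaThreshold: {min_karma}
            after: "{after}"
            filter: "tagged"
          }}}}) {{
            results {{ title pageUrl postedAt htmlBody }}
          }}
        }}'''
        res = requests.post(f'{BASE_URL}/graphql', headers=HEADERS, json={'query': query})
        return res.json()['data']['posts']['results']

    def iter_posts(start='2005-01-01T00:00:00Z', limit=50, cooldown=0.5):
        after = start
        while True:
            results = fetch_page(after, limit=limit)
            yield from (post for post in results if post['htmlBody'])
            if len(results) < limit:   # a short page means we have caught up
                return
            after = results[-1]['postedAt']
            time.sleep(cooldown)

One small difference: this sketch compares the page size against the same `limit` it requested, whereas `items_list` above checks against a literal 50.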