Merge pull request #9 from datasciencecampus/7-scrape-daily-content

7 scrape daily content
datasciencecampus · Aug 16, 2024 · a44e920 · a44e920
2 parents bd463e1 + e30f8b4
commit a44e920
Show file tree

Hide file tree

Showing 10 changed files with 1,571 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,7 @@
 ### Project structure ###
 data/*
-outputs/*
 .vscode
+out/*
 
 ### Contentious file types (can be removed at user discretion) ###
 

diff --git a/README.md b/README.md
@@ -49,7 +49,37 @@ Once pre-commits are activated, whenever you commit to this repository a series
 **NOTE:** Pre-commit hooks execute Python, so it expects a working Python build.
 
 ## Usage
-Details to follow...
+[theyworkforyou.com](https://www.theyworkforyou.com)
+
+By default, parliamentary content from the previous day (and anything so far on the current day) will be reviewed. However, a number of flags are available for use from the command line. The main time filtering behaviours can be summarised as follows:
+
+- previous day (default) e.g.
+
+``` bash
+$ python scripts/theyworkforyou.py
+```
+
+- a single specified day, given with the optional end date flag (`-d` or `--end`) e.g.
+
+``` bash
+$ python scripts/theyworkforyou.py -d 2024-05-20
+```
+
+- range from start date (`-s` or `--start`) to end date e.g.
+
+``` bash
+$ python scripts/theyworkforyou.py -s 2024-05-20 -d 2024-05-24
+```
+
+- look back from the end date (optional) over a specified window of days (inclusive; `-n` or `--window`) e.g.
+
+``` bash
+$ python scripts/theyworkforyou.py -d 2024-05-24 -n 3
+```
+
+Additionally, the `-w` or `--weekly` flag can be used to generate a report for the previous week, e.g. from a Wednesday to a Wednesday. The `-f` or `--form` flag can also be applied to specify a preferred date format (other than the default of `%Y-%m-%d`).
+
+This demo is tested on the Gemma (1st version) model. It assumes that [Ollama](https://www.ollama.com) has been installed locally and the required model has been downloaded before using ParliAI.
 
 ### Workflow
 Details to follow...

diff --git a/data/.gitkeep → out/.gitkeep b/data/.gitkeep → out/.gitkeep
diff --git a/scripts/theyworkforyou.py b/scripts/theyworkforyou.py
@@ -0,0 +1,215 @@
+"""Script for extracting parliamentary content from TheyWorkForYou."""
+
+import argparse
+import datetime as dt
+import os
+
+import tqdm
+
+from parliai_public import dates
+from parliai_public.readers import Debates, WrittenAnswers
+
+
+def create_reader(
+    reader_class: type[Debates] | type[WrittenAnswers],
+    toml: None | str = None,
+    date_list: None | list[dt.date] = None,
+    llm_name: None | str = None,
+):
+    """
+    Build a reader from a TOML configuration and apply overrides.
+
+    Parameters
+    ----------
+    reader_class : type[Debates] | type[WrittenAnswers]
+        Reader class whose ``from_toml`` constructor is called.
+    toml : str, optional
+        Path to a TOML configuration file, passed straight through to
+        ``reader_class.from_toml``. ``None`` is passed when omitted.
+    date_list : list[dt.date], optional
+        Dates the reader should cover. A missing or empty list leaves
+        the dates of the freshly built reader untouched.
+    llm_name : str, optional
+        Name of the model to use (only locally-installed Ollama-based
+        LLMs in this demo). Falls back to 'gemma' when omitted.
+
+    Returns
+    -------
+    reader : Debates | WrittenAnswers
+        The configured reader instance.
+    """
+
+    instance = reader_class.from_toml(toml)
+
+    # Only override the covered dates when some were actually supplied.
+    if date_list:
+        instance.dates = date_list
+
+    # NOTE(review): the model name is always assigned here, so it replaces
+    # whatever `from_toml` may have set - confirm this is intended.
+    if llm_name is None:
+        llm_name = "gemma"
+    instance.llm_name = llm_name
+
+    return instance
+
+
+def make_summary(
+    reader: Debates | WrittenAnswers,
+    header: str,
+    save: bool = True,
+) -> str:
+    """
+    Gather the reader's latest entries and render them as one section.
+
+    Which entries count as "latest" is decided by the reader itself,
+    typically from the dates it has been configured with:
+
+    1. A specific date.
+    2. A reporting period with start and end dates.
+    3. A date and a number of days to look back over (inclusive of
+       the provided end date).
+    4. Nothing at all, in which case the reader only looks at
+       yesterday.
+
+    Parameters
+    ----------
+    reader : Debates | WrittenAnswers
+        Reader used to retrieve, read, analyse, render and save.
+    header : str
+        Section header placed above the rendered content.
+    save : bool, default=True
+        Whether to save each analysed transcript via the reader.
+
+    Returns
+    -------
+    summary : str
+        The header followed by the rendered entries in Markdown
+        syntax, or by a placeholder sentence when nothing was
+        rendered.
+    """
+
+    latest = reader.retrieve_latest_entries()
+    rendered = []
+
+    if latest:
+        # Pad every entry to the longest one so the progress bar
+        # description keeps a constant width.
+        pad = max(len(name) for name in latest)
+        progress = tqdm.tqdm(latest)
+        for name in progress:
+            progress.set_description(f"Processing {name.ljust(pad)}")
+            page = reader.read(name)
+            if not page:
+                # Nothing usable was read for this entry.
+                continue
+
+            analysis = reader.analyse(page)
+            rendered.append(reader.render(analysis))
+            if save:
+                reader.save(analysis)
+
+    # An empty join means nothing was rendered; use the placeholder.
+    body = "\n\n".join(rendered)
+    if not body:
+        body = "No relevant content found for this period."
+
+    return "\n\n".join([header, body])
+
+
+def main():
+    """
+    Summarise the latest communications in Parliament.
+
+    Parses the command-line flags, builds a debates reader and a
+    written answers reader covering the requested dates, summarises
+    both, and writes the combined Markdown to ``summary.md`` inside
+    the debates reader's output directory.
+
+    When no start, end, window or weekly flag is given, no date list
+    is passed on and each reader keeps its own default dates.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--start",
+        type=str,
+        required=False,
+        help="start of reporting period (default format YYYY-MM-DD)",
+    )
+    parser.add_argument(
+        "-d",
+        "--end",
+        type=str,
+        required=False,
+        help="end of reporting period (default format YYYY-MM-DD)",
+    )
+    parser.add_argument(
+        "-n",
+        "--window",
+        type=int,
+        required=False,
+        help="length of reporting period (inclusive of `end`)",
+    )
+    parser.add_argument(
+        "-f",
+        "--form",
+        type=str,
+        default="%Y-%m-%d",
+        # argparse applies %-formatting to help text, so literal percent
+        # signs must be doubled or `--help` fails with a ValueError.
+        help=(
+            "date string format using directive notation "
+            "(default %%Y-%%m-%%d)"
+        ),
+    )
+    parser.add_argument(
+        "--debates-toml",
+        type=str,
+        required=False,
+        help="path to debates TOML configuration file",
+    )
+    parser.add_argument(
+        "--written-toml",
+        type=str,
+        required=False,
+        help="path to written answers TOML configuration file",
+    )
+    parser.add_argument(
+        "-w",
+        "--weekly",
+        required=False,
+        action="store_true",
+        help="trigger a weekly report from today",
+    )
+    parser.add_argument(
+        "--no-save",
+        required=False,
+        action="store_true",
+        help="do not save data from collected pages",
+    )
+    args = vars(parser.parse_args())
+
+    start = args.get("start")
+    end = args.get("end")
+    window = args.get("window")
+    form = args["form"]
+    save = not args["no_save"]
+
+    # The weekly flag takes precedence over any explicit dates and asks
+    # for an eight-day window with no explicit start or end.
+    if args.get("weekly"):
+        start, end, window = None, None, 8
+
+    # Leave the date list unset when no time filter was requested so the
+    # readers fall back to their own defaults.
+    date_list = None
+    if start or end or window:
+        date_list = dates.list_dates(start, end, window, form)
+
+    debates = create_reader(
+        reader_class=Debates,
+        toml=args.get("debates_toml"),
+        date_list=date_list,
+    )
+    written = create_reader(
+        reader_class=WrittenAnswers,
+        toml=args.get("written_toml"),
+        date_list=date_list,
+    )
+
+    # TODO: refactor to single LLM instantiation
+    debates.instantiate_llm()
+    written.instantiate_llm()
+
+    # Both readers write into the directory created by the debates reader.
+    debates.make_outdir()
+    written.outdir = debates.outdir
+
+    summary = "\n\n".join(
+        (
+            debates.make_header(urls=debates.urls + written.urls),
+            make_summary(debates, "# Debates", save),
+            make_summary(
+                written, "# Written answers (UK Parliament only)", save
+            ),
+        )
+    )
+
+    print("Saving summary...")
+    # Write as UTF-8 explicitly: the summary holds scraped text that may
+    # not be representable in the platform's default encoding.
+    summary_path = os.path.join(debates.outdir, "summary.md")
+    with open(summary_path, "w", encoding="utf-8") as f:
+        f.write(summary)
+
+    print("Done! ✅")
+
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/src/parliai_public/_config/debates.toml b/src/parliai_public/_config/debates.toml
@@ -24,4 +24,4 @@ Now extract all relevant content from the following text:
 
 outdir = "out/theyworkforyou"
 
-llm_name = "gemma:2b"
+llm_name = "gemma"
diff --git a/src/parliai_public/_config/wrans.toml b/src/parliai_public/_config/wrans.toml
@@ -16,4 +16,4 @@ Now extract all relevant content from the following text:
 
 outdir = "out/theyworkforyou"
 
-llm_name = "gemma:2b"
+llm_name = "gemma"