restructure scripts folder
ccxzhang committed Nov 3, 2023
1 parent c28054b commit 37d7b9d
Showing 26 changed files with 190 additions and 155 deletions.
2 changes: 1 addition & 1 deletion docs/tourism/utsa.md
@@ -1,6 +1,6 @@
 # Individual forecasts

-As \textcite{song2019review} illustrates, in tourism research, *time series* models forecast demand, while *econometric* models search for the causes and effects between economic factors and tourism demand. The empirical models include *time series* models (SARIMAX and VAR, which build uni- and multivariate relationships, respectively), an *econometric* model that attempts to establish a relationship between the Google Trends index and the invented ratio, and forecast combinations via linear and least-squares approaches.
+As {cite:t}`song2019review` illustrates, in tourism research, *time series* models forecast demand, while *econometric* models search for the causes and effects between economic factors and tourism demand. The empirical models include *time series* models (SARIMAX and VAR, which build uni- and multivariate relationships, respectively), an *econometric* model that attempts to establish a relationship between the Google Trends index and the invented ratio, and forecast combinations via linear and least-squares approaches.

 ## SARIMAX

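For orientation on the SARIMAX section above, a minimal univariate-plus-exogenous sketch in statsmodels (not part of this commit; the file name, column names, and model orders are illustrative placeholders):

```python
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Placeholder data: monthly arrivals plus a Google Trends index as the exogenous X.
df = pd.read_csv("arrivals.csv", index_col="date", parse_dates=True)

model = SARIMAX(
    df["arrivals"],                 # univariate target series
    exog=df[["gtrends_index"]],     # exogenous regressor (the X in SARIMAX)
    order=(1, 1, 1),                # nonseasonal (p, d, q)
    seasonal_order=(1, 1, 1, 12),   # seasonal (P, D, Q, s) with monthly seasonality
)
result = model.fit(disp=False)

# Out-of-sample forecast; real use would supply *future* exogenous values,
# not this historical tail, which is only a shape-matching placeholder.
forecast = result.forecast(steps=12, exog=df[["gtrends_index"]].tail(12))
```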
120 changes: 120 additions & 0 deletions notebooks/tourism/gt-snowball.ipynb
@@ -0,0 +1,120 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "5476e3ee",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.chdir(\"../../\")\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from src.google_trends import GT"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "55d51a03",
"metadata": {},
"outputs": [],
"source": [
"GoogleAPIkey = os.getenv(\"GoogleAPIkey\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9eecefe8",
"metadata": {},
"outputs": [],
"source": [
"gt = GT(GoogleAPIkey)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "07aad8e2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'item': [{'title': 'Fiji', 'mid': '/m/02wt0', 'value': 100},\n",
" {'title': 'Fijian language', 'mid': '/m/020gps', 'value': 27},\n",
" {'title': 'Concert tour', 'mid': '/g/122cngsd', 'value': 14},\n",
" {'title': 'Island', 'mid': '/m/03s0c', 'value': 13},\n",
" {'title': 'Package tour', 'mid': '/m/05x8w9', 'value': 11},\n",
" {'title': 'Travel', 'mid': '/m/014dsx', 'value': 10},\n",
" {'title': 'Resort', 'mid': '/m/02dkrm', 'value': 8},\n",
" {'title': 'Australia', 'mid': '/m/0chghy', 'value': 7},\n",
" {'title': 'Flight', 'mid': '/m/01515d', 'value': 7},\n",
" {'title': 'New Zealand', 'mid': '/m/0ctw_b', 'value': 7},\n",
" {'title': 'Nadi', 'mid': '/m/022wr3', 'value': 5},\n",
" {'title': 'Holiday', 'mid': '/m/03gkl', 'value': 4},\n",
" {'title': 'Cruise ship', 'mid': '/m/01tq2l', 'value': 4},\n",
" {'title': 'Tourism', 'mid': '/g/120yrv6h', 'value': 4},\n",
" {'title': 'Tour operator', 'mid': '/m/0477l6', 'value': 3},\n",
" {'title': 'Sydney', 'mid': '/m/06y57', 'value': 3},\n",
" {'title': 'Airport', 'mid': '/m/09cjl', 'value': 3},\n",
" {'title': 'Concert', 'mid': '/m/01jddz', 'value': 3},\n",
" {'title': 'Rugby union', 'mid': '/m/06br8', 'value': 3},\n",
" {'title': 'Vacation', 'mid': '/m/02jwqh', 'value': 3},\n",
" {'title': 'Beach', 'mid': '/m/0b3yr', 'value': 3},\n",
" {'title': 'Location', 'mid': '/m/078x4m', 'value': 3},\n",
" {'title': 'Travel visa', 'mid': '/m/01wtrk', 'value': 3},\n",
" {'title': 'World Surf League', 'mid': '/m/089p0g', 'value': 2},\n",
" {'title': 'Country', 'mid': '/m/01lff', 'value': 2}]}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gt.get_top_topics(\"Fiji Tour\", None)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "po",
"language": "python",
"name": "po"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}
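`src/google_trends.py` itself is not shown in this commit. As a rough sketch only, a thin wrapper that would produce the `{'item': [...]}` payload seen above might look like the following; the base URL, endpoint path, and parameter names are assumptions, not the repository's actual code:

```python
import requests

class GT:
    """Hypothetical sketch of the Google Trends wrapper used in the notebook.

    The base URL, endpoint path, and parameter names below are assumptions;
    only the constructor and the get_top_topics signature come from the notebook.
    """

    BASE_URL = "https://www.googleapis.com/trends/v1beta"  # assumed endpoint

    def __init__(self, api_key):
        self.api_key = api_key

    def get_top_topics(self, term, geo=None):
        # Assumed query parameters; the API key gates access to the service.
        params = {"term": term, "key": self.api_key}
        if geo is not None:
            params["restrictions.geo"] = geo
        response = requests.get(f"{self.BASE_URL}/topics/top",
                                params=params, timeout=30)
        response.raise_for_status()
        # e.g. {'item': [{'title': 'Fiji', 'mid': '/m/02wt0', 'value': 100}, ...]}
        return response.json()
```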
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
41 changes: 0 additions & 41 deletions scripts/python/nlp/lda.py

This file was deleted.

34 changes: 0 additions & 34 deletions scripts/python/nlp/utils.py

This file was deleted.

57 changes: 0 additions & 57 deletions scripts/python/scraper/rnz.py

This file was deleted.

@@ -1,5 +1,5 @@
 import os
-os.chdir("../../../")
+os.chdir("../../")
 import pandas as pd
 import json
 from src.scraper.scrape import *
File renamed without changes.
62 changes: 53 additions & 9 deletions scripts/scraping/rnz.py
@@ -1,13 +1,57 @@
 import os
-os.chdir("../../")
+import json
+import pandas as pd
+import numpy as np
+os.chdir("../../../")
 from src.scraper.scrape import *

-scraper = WebScraper(
-    url=
-    "https://www.theguardian.com/world/2023/aug/12/between-two-worlds-life-of-png-tribe-leader-and-plantation-owner-honoured",
-    parser="xpath")
-scraper.load_page()
-scraper.parse_page()
+# Set up
+target_dir = os.getcwd() + "/data/text/rnz/"
+if not os.path.exists(target_dir):
+    os.mkdir(target_dir)

-print(scraper.parsed_content)
+host_url = "https://www.rnz.co.nz"
+countries = ["Solomon Islands"]
+
+for country in countries:
+    filename = country.replace(" ", "_").lower() + "_rnz_urls.csv"
+    country_base_url = host_url + "/tags/" + str(country) + "?page="
+    country_urls = [country_base_url + str(i) for i in range(1, 500)]
+
+    scraper = WebScraper(parser="html.parser")
+    data = scraper.scrape_urls(country_urls, "o-digest__detail")
+
+    output = []
+    for pg in data:
+        for i in pg:
+            output.append([i.find("h3").text,
+                           i.find("span").text,
+                           i.find("a")["href"]])
+
+    rnz_df = pd.DataFrame(
+        output, columns=["title", "date", "url"]).drop_duplicates()
+    rnz_df["news"] = rnz_df["url"].apply(
+        lambda x: x.split("/")[2] != "programmes")
+    rnz_df["url"] = [host_url + str(url) for url in rnz_df.url]
+
+    # Save url files
+    rnz_df.to_csv(target_dir + filename, encoding="utf-8")
+
+for country in countries:
+    country_filepath = target_dir + country.lower().replace(" ", "_") + \
+        "_rnz_urls.csv"
+    df = pd.read_csv(country_filepath).drop("Unnamed: 0", axis=1)
+    news_urls = df[df.news == True]["url"].tolist()
+    scraper = WebScraper(parser="html.parser")
+    nested_data = scraper.scrape_urls(news_urls, "article__body", speed_up=True)
+
+    news_output = []
+    for url, i in nested_data:
+        try:
+            text = "".join(p.text for p in i[0].find_all("p"))
+            news_output.append([url, text])
+        except Exception as e:
+            print(f"An error has occurred: {e}.")
+            news_output.append([url, np.NAN])
+
+    country_news_df = pd.DataFrame(news_output, columns=["url", "news"])
+    news_filepath = target_dir + country.lower().replace(" ", "_") + \
+        "_rnz_news.csv"
+    country_news_df.to_csv(news_filepath, encoding="utf-8")
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
27 changes: 15 additions & 12 deletions src/scraper/scrape.py
@@ -36,7 +36,7 @@ def __init__(self, parser="xpath", headers=None):
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}

def request_url(self, url, timeout=5):
def request_url(self, url, timeout=30):
"""
Sends an HTTP GET request to the specified URL.
@@ -110,23 +110,26 @@ def scrape_urls(self, urls, expression, speed_up=False):
         if not isinstance(urls, list):
             raise TypeError("The 'urls' argument must be a list of URLs.")

-        if isinstance(expression, str):
-            expression = [expression] * len(urls)
+        # if isinstance(expression, str):
+        #     expression = [expression] * len(urls)

-        if not isinstance(expression, list) or len(expression) != len(urls):
-            raise ValueError(
-                "The 'expression' argument must be a string or a list of the same length as 'urls'."
-            )
+        # if not isinstance(expression, list) or len(expression) != len(urls):
+        #     raise ValueError(
+        #         "The 'expression' argument must be a string or a list of the same length as 'urls'."
+        #     )
+
+        # if isinstance(expression, list):
+        #     expression = [expression * len(urls)]

         scraped_data = []
         if speed_up:
             with tqdm(total=len(urls)) as pbar:
                 max_workers = multiprocessing.cpu_count() + 4
                 with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                    future_to_url = {executor.submit(self.scrape_url, url, expr): (
-                        url, expr) for url, expr in zip(urls, expression)}
+                    future_to_url = {executor.submit(self.scrape_url, url, expression): (
+                        url) for url in urls}
                     for future in as_completed(future_to_url):
-                        url, _ = future_to_url[future]
+                        url = future_to_url[future]
                         try:
                             data = future.result()
                         except Exception as exc:
@@ -136,8 +139,8 @@ def scrape_urls(self, urls, expression, speed_up=False):
                         pbar.update(1)
         else:
             with tqdm(total=len(urls)) as pbar:
-                for url, expr in zip(urls, expression):
-                    data = self.scrape_url(url, expr)
+                for url in urls:
+                    data = self.scrape_url(url, expression)
                     scraped_data.append(data)
                     pbar.update(1)

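With this change, `scrape_urls` takes a single expression that applies to every URL instead of a per-URL list, and `speed_up=True` fans requests out over a thread pool. A minimal usage sketch under the revised signature (the URLs here are illustrative):

```python
from src.scraper.scrape import WebScraper

scraper = WebScraper(parser="html.parser")
urls = [
    "https://www.rnz.co.nz/tags/Solomon%20Islands?page=1",
    "https://www.rnz.co.nz/tags/Solomon%20Islands?page=2",
]

# One CSS class is reused for all URLs; the old per-URL expression list is gone.
pages = scraper.scrape_urls(urls, "o-digest__detail", speed_up=True)
```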
