restructure scripts folder
ccxzhang committed Nov 3, 2023
1 parent c28054b commit 37d7b9d
Showing 26 changed files with 190 additions and 155 deletions.
2 changes: 1 addition & 1 deletion docs/tourism/utsa.md
@@ -1,6 +1,6 @@
 # Individual forecasts

-As \textcite{song2019review} illustrates, in tourism research, *time series* models forecast demand, while *econometric* models search for the causes and effects between economic factors and tourism demand. The empirical models include *time series* models (SARIMAX and VAR, which build uni- and multivariate relationships, respectively), an *econometric* model that attempts to establish a relationship between the Google Trends index and the invented ratio, and forecast combinations via linear and least-squares approaches.
+As {cite:t}`song2019review` illustrates, in tourism research, *time series* models forecast demand, while *econometric* models search for the causes and effects between economic factors and tourism demand. The empirical models include *time series* models (SARIMAX and VAR, which build uni- and multivariate relationships, respectively), an *econometric* model that attempts to establish a relationship between the Google Trends index and the invented ratio, and forecast combinations via linear and least-squares approaches.

 ## SARIMAX

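For orientation on the SARIMAX section above, a minimal univariate-plus-exogenous sketch in statsmodels (not part of this commit; the file name, column names, and model orders are illustrative placeholders):

```python
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Placeholder data: monthly arrivals plus a Google Trends index as the exogenous X.
df = pd.read_csv("arrivals.csv", index_col="date", parse_dates=True)

model = SARIMAX(
    df["arrivals"],                 # univariate target series
    exog=df[["gtrends_index"]],     # exogenous regressor (the X in SARIMAX)
    order=(1, 1, 1),                # nonseasonal (p, d, q)
    seasonal_order=(1, 1, 1, 12),   # seasonal (P, D, Q, s) with monthly seasonality
)
result = model.fit(disp=False)

# Out-of-sample forecast; real use would supply *future* exogenous values,
# not this historical tail, which is only a shape-matching placeholder.
forecast = result.forecast(steps=12, exog=df[["gtrends_index"]].tail(12))
```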
120 changes: 120 additions & 0 deletions notebooks/tourism/gt-snowball.ipynb
@@ -0,0 +1,120 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "5476e3ee",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.chdir(\"../../\")\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from src.google_trends import GT"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "55d51a03",
"metadata": {},
"outputs": [],
"source": [
"GoogleAPIkey = os.getenv(\"GoogleAPIkey\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9eecefe8",
"metadata": {},
"outputs": [],
"source": [
"gt = GT(GoogleAPIkey)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "07aad8e2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'item': [{'title': 'Fiji', 'mid': '/m/02wt0', 'value': 100},\n",
" {'title': 'Fijian language', 'mid': '/m/020gps', 'value': 27},\n",
" {'title': 'Concert tour', 'mid': '/g/122cngsd', 'value': 14},\n",
" {'title': 'Island', 'mid': '/m/03s0c', 'value': 13},\n",
" {'title': 'Package tour', 'mid': '/m/05x8w9', 'value': 11},\n",
" {'title': 'Travel', 'mid': '/m/014dsx', 'value': 10},\n",
" {'title': 'Resort', 'mid': '/m/02dkrm', 'value': 8},\n",
" {'title': 'Australia', 'mid': '/m/0chghy', 'value': 7},\n",
" {'title': 'Flight', 'mid': '/m/01515d', 'value': 7},\n",
" {'title': 'New Zealand', 'mid': '/m/0ctw_b', 'value': 7},\n",
" {'title': 'Nadi', 'mid': '/m/022wr3', 'value': 5},\n",
" {'title': 'Holiday', 'mid': '/m/03gkl', 'value': 4},\n",
" {'title': 'Cruise ship', 'mid': '/m/01tq2l', 'value': 4},\n",
" {'title': 'Tourism', 'mid': '/g/120yrv6h', 'value': 4},\n",
" {'title': 'Tour operator', 'mid': '/m/0477l6', 'value': 3},\n",
" {'title': 'Sydney', 'mid': '/m/06y57', 'value': 3},\n",
" {'title': 'Airport', 'mid': '/m/09cjl', 'value': 3},\n",
" {'title': 'Concert', 'mid': '/m/01jddz', 'value': 3},\n",
" {'title': 'Rugby union', 'mid': '/m/06br8', 'value': 3},\n",
" {'title': 'Vacation', 'mid': '/m/02jwqh', 'value': 3},\n",
" {'title': 'Beach', 'mid': '/m/0b3yr', 'value': 3},\n",
" {'title': 'Location', 'mid': '/m/078x4m', 'value': 3},\n",
" {'title': 'Travel visa', 'mid': '/m/01wtrk', 'value': 3},\n",
" {'title': 'World Surf League', 'mid': '/m/089p0g', 'value': 2},\n",
" {'title': 'Country', 'mid': '/m/01lff', 'value': 2}]}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gt.get_top_topics(\"Fiji Tour\", None)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "po",
"language": "python",
"name": "po"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}
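`src/google_trends.py` itself is not shown in this commit. As a rough sketch only, a thin wrapper that would produce the `{'item': [...]}` payload seen above might look like the following; the base URL, endpoint path, and parameter names are assumptions, not the repository's actual code:

```python
import requests

class GT:
    """Hypothetical sketch of the Google Trends wrapper used in the notebook.

    The base URL, endpoint path, and parameter names below are assumptions;
    only the constructor and the get_top_topics signature come from the notebook.
    """

    BASE_URL = "https://www.googleapis.com/trends/v1beta"  # assumed endpoint

    def __init__(self, api_key):
        self.api_key = api_key

    def get_top_topics(self, term, geo=None):
        # Assumed query parameters; the API key gates access to the service.
        params = {"term": term, "key": self.api_key}
        if geo is not None:
            params["restrictions.geo"] = geo
        response = requests.get(f"{self.BASE_URL}/topics/top",
                                params=params, timeout=30)
        response.raise_for_status()
        # e.g. {'item': [{'title': 'Fiji', 'mid': '/m/02wt0', 'value': 100}, ...]}
        return response.json()
```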
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
41 changes: 0 additions & 41 deletions scripts/python/nlp/lda.py

This file was deleted.

34 changes: 0 additions & 34 deletions scripts/python/nlp/utils.py

This file was deleted.

57 changes: 0 additions & 57 deletions scripts/python/scraper/rnz.py

This file was deleted.

@@ -1,5 +1,5 @@
 import os
-os.chdir("../../../")
+os.chdir("../../")
 import pandas as pd
 import json
 from src.scraper.scrape import *
File renamed without changes.
62 changes: 53 additions & 9 deletions scripts/scraping/rnz.py
@@ -1,13 +1,57 @@
 import os
-os.chdir("../../")
+import json
+import pandas as pd
+import numpy as np
+os.chdir("../../../")
 from src.scraper.scrape import *

-scraper = WebScraper(
-    url=
-    "https://www.theguardian.com/world/2023/aug/12/between-two-worlds-life-of-png-tribe-leader-and-plantation-owner-honoured",
-    parser="xpath")
-scraper.load_page()
-scraper.parse_page()
+# Set up
+target_dir = os.getcwd() + "/data/text/rnz/"
+if not os.path.exists(target_dir):
+    os.mkdir(target_dir)

-print(scraper.parsed_content)
+host_url = "https://www.rnz.co.nz"
+countries = ["Solomon Islands"]
+
+for country in countries:
+    filename = country.replace(" ", "_").lower() + "_rnz_urls.csv"
+    country_base_url = host_url + "/tags/" + str(country) + "?page="
+    country_urls = [country_base_url + str(i) for i in range(1, 500)]
+
+    scraper = WebScraper(parser="html.parser")
+    data = scraper.scrape_urls(country_urls, "o-digest__detail")
+
+    output = []
+    for pg in data:
+        for i in pg:
+            output.append([i.find("h3").text,
+                           i.find("span").text,
+                           i.find("a")["href"]])
+
+    rnz_df = pd.DataFrame(
+        output, columns=["title", "date", "url"]).drop_duplicates()
+    rnz_df["news"] = rnz_df["url"].apply(
+        lambda x: x.split("/")[2] != "programmes")
+    rnz_df["url"] = [host_url + str(url) for url in rnz_df.url]
+
+    # Save url files
+    rnz_df.to_csv(target_dir + filename, encoding="utf-8")
+
+for country in countries:
+    country_filepath = target_dir + country.lower().replace(" ", "_") + \
+        "_rnz_urls.csv"
+    df = pd.read_csv(country_filepath).drop("Unnamed: 0", axis=1)
+    news_urls = df[df.news == True]["url"].tolist()
+    scraper = WebScraper(parser="html.parser")
+    nested_data = scraper.scrape_urls(news_urls, "article__body", speed_up=True)
+
+    news_output = []
+    for url, i in nested_data:
+        try:
+            text = "".join(p.text for p in i[0].find_all("p"))
+            news_output.append([url, text])
+        except Exception as e:
+            print(f"An error has occurred: {e}.")
+            news_output.append([url, np.NAN])
+
+    country_news_df = pd.DataFrame(news_output, columns=["url", "news"])
+    news_filepath = target_dir + country.lower().replace(" ", "_") + \
+        "_rnz_news.csv"
+    country_news_df.to_csv(news_filepath, encoding="utf-8")
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
27 changes: 15 additions & 12 deletions src/scraper/scrape.py
@@ -36,7 +36,7 @@ def __init__(self, parser="xpath", headers=None):
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}

def request_url(self, url, timeout=5):
def request_url(self, url, timeout=30):
"""
Sends an HTTP GET request to the specified URL.
@@ -110,23 +110,26 @@ def scrape_urls(self, urls, expression, speed_up=False):
         if not isinstance(urls, list):
             raise TypeError("The 'urls' argument must be a list of URLs.")

-        if isinstance(expression, str):
-            expression = [expression] * len(urls)
+        # if isinstance(expression, str):
+        #     expression = [expression] * len(urls)

-        if not isinstance(expression, list) or len(expression) != len(urls):
-            raise ValueError(
-                "The 'expression' argument must be a string or a list of the same length as 'urls'."
-            )
+        # if not isinstance(expression, list) or len(expression) != len(urls):
+        #     raise ValueError(
+        #         "The 'expression' argument must be a string or a list of the same length as 'urls'."
+        #     )
+
+        # if isinstance(expression, list):
+        #     expression = [expression * len(urls)]

         scraped_data = []
         if speed_up:
             with tqdm(total=len(urls)) as pbar:
                 max_workers = multiprocessing.cpu_count() + 4
                 with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                    future_to_url = {executor.submit(self.scrape_url, url, expr): (
-                        url, expr) for url, expr in zip(urls, expression)}
+                    future_to_url = {executor.submit(self.scrape_url, url, expression): (
+                        url) for url in urls}
                     for future in as_completed(future_to_url):
-                        url, _ = future_to_url[future]
+                        url = future_to_url[future]
                         try:
                             data = future.result()
                         except Exception as exc:
@@ -136,8 +139,8 @@ def scrape_urls(self, urls, expression, speed_up=False):
                         pbar.update(1)
         else:
             with tqdm(total=len(urls)) as pbar:
-                for url, expr in zip(urls, expression):
-                    data = self.scrape_url(url, expr)
+                for url in urls:
+                    data = self.scrape_url(url, expression)
                     scraped_data.append(data)
                     pbar.update(1)

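With this change, `scrape_urls` takes a single expression that applies to every URL instead of a per-URL list, and `speed_up=True` fans requests out over a thread pool. A minimal usage sketch under the revised signature (the URLs here are illustrative):

```python
from src.scraper.scrape import WebScraper

scraper = WebScraper(parser="html.parser")
urls = [
    "https://www.rnz.co.nz/tags/Solomon%20Islands?page=1",
    "https://www.rnz.co.nz/tags/Solomon%20Islands?page=2",
]

# One CSS class is reused for all URLs; the old per-URL expression list is gone.
pages = scraper.scrape_urls(urls, "o-digest__detail", speed_up=True)
```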
