This repository has been archived by the owner on May 30, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsetup.py
146 lines (116 loc) · 4.62 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from google.cloud import bigquery
from queries import *
import json
with open('config.json', 'r') as f:
config = json.load(f)
project_id = config['project_id']
common_dataset_id = config['common_dataset_id']
country_id = config['country_id']
crux_month = get_crux_latest_month(project_id,country_id)
ha_date = get_ha_latest_month(project_id)
origin_appearances_table= config['origin_appearances_table']
"""# Identify Local Websites
We're defining local websites as the websites that only accessed by users in a country and not exist in other countries dataset. Using that definition than we need to check all the website that exist in each country datasets.
### Count origin apperances
"""
client = bigquery.Client(project=project_id)
common_dataset=project_id+'.'+common_dataset_id;
global_appearance_table=common_dataset+'.'+origin_appearances_table;
# Make sure we have common dataset
try:
dataset=client.get_dataset(common_dataset)
except:
# Construct a full Dataset object to send to the API.
datasetref = client.dataset(common_dataset_id)
dataset = bigquery.Dataset(datasetref)
# Specify the geographic location where the dataset should reside.
dataset.location = "US"
# Send the dataset to the API for creation.
# Raises google.api_core.exceptions.Conflict if the Dataset already
# exists within the project.
dataset = client.create_dataset(dataset) # API request
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
# Create appearance data
job_config = bigquery.QueryJobConfig()
job_config.write_disposition='WRITE_TRUNCATE';
table_ref = client.dataset(common_dataset_id).table(origin_appearances_table)
job_config.destination = table_ref
sql=f'''
SELECT
origin,
count(country) AS appearances
FROM `chrome-ux-report.materialized.country_summary`
WHERE
yyyymm = '{crux_month}'
GROUP BY origin
'''
query_job = client.query(sql,location='US',job_config=job_config)
# API request - starts the query
query_job.result() # Waits for the query to finish
print('View global appearance created to table {}'.format(table_ref.path))
"""### Create the country dataset"""
# Set dataset_id to the ID of the dataset to create.
country_dataset_id = f'{client.project}.{country_id}'
try:
dataset=client.get_dataset(country_dataset_id)
print("Dataset {}.{} exist!".format(client.project, dataset.dataset_id))
except:
# Construct a full Dataset object to send to the API.
datasetref = client.dataset(country_id)
dataset = bigquery.Dataset(datasetref)
# Specify the geographic location where the dataset should reside.
dataset.location = "US"
# Send the dataset to the API for creation.
# Raises google.api_core.exceptions.Conflict if the Dataset already
# exists within the project.
dataset = client.create_dataset(dataset) # API request
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
"""### Create view for selecting the origins those only accessed from a country"""
# Set the destination table
table_ref = client.dataset(country_id).table('origins')
job_config.destination = table_ref
sql=f'''
SELECT
DISTINCT origin
FROM `chrome-ux-report.country_{country_id}.{crux_month}`
WHERE origin in (
SELECT origin
FROM `{global_appearance_table}`
WHERE appearances = 1
)
'''
query_job = client.query(
sql,
# Location must match that of the dataset(s) referenced in the query
# and of the destination table.
location='US',
job_config=job_config) # API request - starts the query
query_job.result() # Waits for the query to finish
print('Country local websites results loaded to table {}'.format(table_ref.path))
for technology in config["technologies"]:
table_id=technology.lower().replace('.','_').replace(' ','_')
table_ref = client.dataset(country_id).table(table_id)
job_config.destination = table_ref
sql=f'''
SELECT DISTINCT origin
FROM
`chrome-ux-report.country_{country_id}.{crux_month}` c
LEFT JOIN
`httparchive.technologies.{ha_date}_*` t
ON
c.origin = RTRIM(t.url,'/')
WHERE origin in (
SELECT origin
FROM `{global_appearance_table}`
WHERE appearances = 1
) AND
app = '{technology}'
'''
query_job = client.query(
sql,
# Location must match that of the dataset(s) referenced in the query
# and of the destination table.
location='US',
job_config=job_config) # API request - starts the query
query_job.result() # Waits for the query to finish
print('Country technology spesific website results loaded to table {}'.format(table_ref.path))