Commit 0921e9f

Merge pull request #1 from acmiyaguchi/update
Remove bigquery-etl resolution
2 parents: 3801239 + 7204f8e

4 files changed (+0 -61 lines)

.gitmodules

Lines changed: 0 additions & 3 deletions

@@ -1,3 +0,0 @@
-[submodule "bigquery-etl"]
-	path = bigquery-etl
-	url = git@github.com:mozilla/bigquery-etl.git

bigquery-etl

Lines changed: 0 additions & 1 deletion
This file was deleted.

etl-graph/__main__.py

Lines changed: 0 additions & 10 deletions
@@ -8,7 +8,6 @@
 from .crawler import (
     fetch_dataset_listing,
     fetch_table_listing,
-    resolve_bigquery_etl_references,
     resolve_view_references,
 )
 from .utils import ensure_folder, ndjson_load, print_json, qualify, run, run_query
@@ -37,15 +36,6 @@ def crawl():
     resolve_view_references(views_listing, data_root / project)


-@cli.command()
-def etl():
-    """Crawl bigquery-etl."""
-    # this is basically dryrun, but with some data collection baked in.
-    resolve_bigquery_etl_references(
-        ROOT / "bigquery-etl", ensure_folder(ROOT / "data" / "bigquery_etl")
-    )
-
-
 @cli.command()
 def query_logs():
     """Create edgelist from jobs by project query logs."""

etl-graph/crawler.py

Lines changed: 0 additions & 47 deletions
@@ -97,46 +97,6 @@ def _view_dryrun(view_root, view):
         json.dump(subset, fp, indent=2)


-def _bigquery_etl_dryrun(output_root: Path, query: Path):
-    # this makes the assumption that the query writes to a destination table
-    # relative to the path in the repository
-    project_id = "moz-fx-data-shared-prod"
-    dataset_id = query.parent.parent.name
-    table_id = query.parent.name
-    base_query = query.read_text()
-    data = None
-    # TODO: set the following parameters
-    # submission_date, n_clients, sample_size, min_sample_id, max_sample_id
-    try:
-        result = run(
-            [
-                "bq",
-                "query",
-                f"--project_id={project_id}",
-                "--format=json",
-                "--use_legacy_sql=false",
-                "--dry_run",
-                base_query,
-            ]
-        )
-        data = json.loads(result)
-    except Exception as e:
-        print(e)
-    if not data:
-        print(
-            f"unable to resolve query {query.relative_to(query.parent.parent.parent)}"
-        )
-        return
-    with (output_root / f"{dataset_id}.{table_id}.json").open("w") as fp:
-        subset = data["statistics"]
-        del subset["query"]["schema"]
-        subset = {
-            **dict(projectId=project_id, tableId=table_id, datasetId=dataset_id),
-            **subset,
-        }
-        json.dump(subset, fp, indent=2)
-
-
 def resolve_view_references(view_listing, project_root):
     # we don't really care about intermediate files
     view_root = Path(tempfile.mkdtemp())
@@ -154,10 +114,3 @@ def resolve_view_references(view_listing, project_root):
         data = json.load(view.open())
         fp.write(json.dumps(data))
         fp.write("\n")
-
-
-def resolve_bigquery_etl_references(bigquery_etl_root: Path, output_root: Path):
-    queries = list(bigquery_etl_root.glob("sql/**/query.sql"))
-    query_root = ensure_folder(output_root / "query")
-    for query in queries:
-        _bigquery_etl_dryrun(query_root, query)
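
The deleted `_bigquery_etl_dryrun` leaned on bigquery-etl's directory convention: each query lives at sql/<dataset_id>/<table_id>/query.sql, so the destination table can be read straight off the path. A small illustration of that mapping (the example path is hypothetical):

# Path convention assumed by the deleted code: sql/<dataset_id>/<table_id>/query.sql
from pathlib import Path

# hypothetical example path, for illustration only
query = Path("sql/telemetry_derived/clients_daily_v6/query.sql")

dataset_id = query.parent.parent.name  # "telemetry_derived"
table_id = query.parent.name           # "clients_daily_v6"
print(f"{dataset_id}.{table_id}")      # -> telemetry_derived.clients_daily_v6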
