Commit 0921e9f

Merge pull request #1 from acmiyaguchi/update
Remove bigquery-etl resolution
2 parents: 3801239 + 7204f8e

4 files changed (+0 -61 lines)

.gitmodules

Lines changed: 0 additions & 3 deletions

@@ -1,3 +0,0 @@
-[submodule "bigquery-etl"]
-	path = bigquery-etl
-	url = git@github.com:mozilla/bigquery-etl.git

bigquery-etl

Lines changed: 0 additions & 1 deletion
This file was deleted.

etl-graph/__main__.py

Lines changed: 0 additions & 10 deletions
@@ -8,7 +8,6 @@
 from .crawler import (
     fetch_dataset_listing,
     fetch_table_listing,
-    resolve_bigquery_etl_references,
     resolve_view_references,
 )
 from .utils import ensure_folder, ndjson_load, print_json, qualify, run, run_query
@@ -37,15 +36,6 @@ def crawl():
     resolve_view_references(views_listing, data_root / project)


-@cli.command()
-def etl():
-    """Crawl bigquery-etl."""
-    # this is basically dryrun, but with some data collection baked in.
-    resolve_bigquery_etl_references(
-        ROOT / "bigquery-etl", ensure_folder(ROOT / "data" / "bigquery_etl")
-    )
-
-
 @cli.command()
 def query_logs():
     """Create edgelist from jobs by project query logs."""

etl-graph/crawler.py

Lines changed: 0 additions & 47 deletions
@@ -97,46 +97,6 @@ def _view_dryrun(view_root, view):
         json.dump(subset, fp, indent=2)


-def _bigquery_etl_dryrun(output_root: Path, query: Path):
-    # this makes the assumption that the query writes to a destination table
-    # relative to the path in the repository
-    project_id = "moz-fx-data-shared-prod"
-    dataset_id = query.parent.parent.name
-    table_id = query.parent.name
-    base_query = query.read_text()
-    data = None
-    # TODO: set the following parameters
-    # submission_date, n_clients, sample_size, min_sample_id, max_sample_id
-    try:
-        result = run(
-            [
-                "bq",
-                "query",
-                f"--project_id={project_id}",
-                "--format=json",
-                "--use_legacy_sql=false",
-                "--dry_run",
-                base_query,
-            ]
-        )
-        data = json.loads(result)
-    except Exception as e:
-        print(e)
-    if not data:
-        print(
-            f"unable to resolve query {query.relative_to(query.parent.parent.parent)}"
-        )
-        return
-    with (output_root / f"{dataset_id}.{table_id}.json").open("w") as fp:
-        subset = data["statistics"]
-        del subset["query"]["schema"]
-        subset = {
-            **dict(projectId=project_id, tableId=table_id, datasetId=dataset_id),
-            **subset,
-        }
-        json.dump(subset, fp, indent=2)
-
-
 def resolve_view_references(view_listing, project_root):
     # we don't really care about intermediate files
     view_root = Path(tempfile.mkdtemp())
@@ -154,10 +114,3 @@ def resolve_view_references(view_listing, project_root):
         data = json.load(view.open())
         fp.write(json.dumps(data))
         fp.write("\n")
-
-
-def resolve_bigquery_etl_references(bigquery_etl_root: Path, output_root: Path):
-    queries = list(bigquery_etl_root.glob("sql/**/query.sql"))
-    query_root = ensure_folder(output_root / "query")
-    for query in queries:
-        _bigquery_etl_dryrun(query_root, query)
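
The deleted `_bigquery_etl_dryrun` leaned on bigquery-etl's directory convention: each query lives at sql/<dataset_id>/<table_id>/query.sql, so the destination table can be read straight off the path. A small illustration of that mapping (the example path is hypothetical):

# Path convention assumed by the deleted code: sql/<dataset_id>/<table_id>/query.sql
from pathlib import Path

# hypothetical example path, for illustration only
query = Path("sql/telemetry_derived/clients_daily_v6/query.sql")

dataset_id = query.parent.parent.name  # "telemetry_derived"
table_id = query.parent.name           # "clients_daily_v6"
print(f"{dataset_id}.{table_id}")      # -> telemetry_derived.clients_daily_v6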
