Add batching, to not go over 10MB size
chmielsen committed Jul 23, 2024
1 parent: e1738d4 · commit: 1e5e9e5
Showing 1 changed file with 25 additions and 3 deletions.
28 changes: 25 additions & 3 deletions plugin_scripts/insert_rows.py
@@ -1,13 +1,25 @@
 import json
 import logging
 import sys
+from itertools import islice
 
 from google.cloud import bigquery
 
 
 from .config import Config, read_config
 
 sys.tracebacklimit = 0
 
+BATCH_SIZE = 20000
+
+def batched(iterable, n):
+    # batched('ABCDEFG', 3) → ABC DEF G
+    if n < 1:
+        raise ValueError('n must be at least one')
+    iterator = iter(iterable)
+    while batch := tuple(islice(iterator, n)):
+        yield batch
 
 
 def insert_rows(config: Config) -> None:
     """
@@ -31,14 +43,24 @@ def insert_rows(config: Config) -> None:
     with open(config.bq_rows_as_json_path, "r") as row_file:
         rows = json.load(row_file)
 
-    logging.info(f"Loaded {len(rows)} rows. Inserting...")
+    if not isinstance(rows, list):
+        raise ValueError(f"Expected JSON file to be a list of rows, was: {type(rows)}")
+
+    logging.info(f"Loaded {len(rows)} rows. Inserting in batches {BATCH_SIZE}...")
+
+    total_errors = []
+    for batch in batched(rows, BATCH_SIZE):
+        errors = client.insert_rows_json(table_ref, batch)
+        total_errors.extend(errors)
 
-    errors = client.insert_rows_json(table_ref, rows)
 
     logging.info(f"Inserted rows with {len(errors)} errors")
-    for e in errors:
+    for e in total_errors:
         logging.error(e)
-    if len(errors) > 0:
+    if len(total_errors) > 0:
         raise Exception("Got exceptions on returning rows, see above.")

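For context, a minimal standalone sketch (not part of the commit) of how the batched helper added above splits a row list into fixed-size chunks. The sample rows and the batch size of 3 are made up for illustration; the plugin itself uses BATCH_SIZE = 20000 and passes each chunk to client.insert_rows_json as shown in the diff.

from itertools import islice


def batched(iterable, n):
    # Same helper as in the commit: yield successive tuples of at most n items.
    if n < 1:
        raise ValueError('n must be at least one')
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        yield batch


rows = [{"id": i} for i in range(7)]   # hypothetical rows
for batch in batched(rows, 3):         # 3 instead of 20000 so the output stays small
    print(len(batch), batch)           # batches of 3, 3 and 1 rows

Each tuple yielded this way becomes one insert_rows_json request in the plugin, which is what keeps any single request under the size limit the commit message refers to. Python 3.12's standard library provides an equivalent itertools.batched; the helper mirrors that recipe for older runtimes.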
