Skip to content

Commit

Permalink
Create diffs between BSA downloads
Browse files Browse the repository at this point in the history
Compare the latest BSA download and the previous one to discover:

* New and deleted orders

* New and deleted labels as well as konwn labels referenced by new orders

The diff is stored on GCS as two files, one for orders and one for
diffs.
  • Loading branch information
weiminyu committed Nov 13, 2023
1 parent c918fa1 commit e29f57d
Show file tree
Hide file tree
Showing 7 changed files with 628 additions and 13 deletions.
253 changes: 253 additions & 0 deletions core/src/main/java/google/registry/bsa/BsaDiffCreator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
// Copyright 2023 The Nomulus Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package google.registry.bsa;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static com.google.common.collect.Maps.newHashMap;
import static com.google.common.collect.Multimaps.newListMultimap;
import static com.google.common.collect.Multimaps.toMultimap;

import com.google.auto.value.AutoValue;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import google.registry.bsa.api.Label;
import google.registry.bsa.api.Label.LabelType;
import google.registry.bsa.api.Order;
import google.registry.bsa.api.Order.OrderType;
import google.registry.bsa.persistence.DownloadSchedule;
import google.registry.bsa.persistence.DownloadSchedule.CompletedJob;
import google.registry.tldconfig.idn.IdnTableEnum;
import java.util.HashMap;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Stream;
import javax.inject.Inject;

/** Creates diffs between the most recent download and the previous one. */
class BsaDiffCreator {

private static final Splitter LINE_SPLITTER = Splitter.on(',').trimResults();
private static final Splitter ORDER_SPLITTER = Splitter.on(';').trimResults();

private static final String BSA_CSV_HEADER = "domainLabel,orderIDs";

/** An impossible value for order ID. See {@link #createDiff} for usage. */
static final Long ORDER_ID_SENTINEL = Long.MIN_VALUE;

private final GcsClient gcsClient;

@Inject
BsaDiffCreator(GcsClient gcsClient) {
this.gcsClient = gcsClient;
}

private <K, V extends Comparable> Multimap<K, V> listBackedMultiMap() {
return newListMultimap(newHashMap(), Lists::newArrayList);
}

BsaDiff createDiff(DownloadSchedule schedule, IdnChecker idnChecker) {
String currentJobName = schedule.jobName();
Optional<String> previousJobName = schedule.latestCompleted().map(CompletedJob::jobName);
/**
* Memory usage is a concern when creating a diff, when the newest download needs to be held in
* memory in its entirety. The top-grade AppEngine VM has 3GB of memory, leaving less than 1.5GB
* to application memory footprint after subtracting overheads due to copying garbage collection
* and non-heap data etc. Assuming 400K labels, each of which on average included in 5 orders,
* the memory footprint is at least 300MB when loaded into a Hashset-backed Multimap (64-bit
* JVM, with 12-byte object header, 16-byte array header, and 16-byte alignment).
*
* <p>The memory footprint can be reduced in two ways, by using a canonical instance for each
* order ID value, and by using a ArrayList-backed Multimap. Together they reduce memory size to
* well below 100MB for the scenario above.
*
* <p>We need to watch out for the download sizes even after the migration to GKE. However, at
* that point we will have a wider selection of hardware.
*
* <p>Beam pipeline is not a good option. It has to be launched as a separate, asynchronous job,
* and there is no guaranteed limit to launch delay. Both issues would increase code complexity.
*/
Canonicals<Long> canonicals = new Canonicals<>();
try (Stream<Line> currentStream = loadBlockLists(currentJobName);
Stream<Line> previousStream =
previousJobName.map(this::loadBlockLists).orElseGet(Stream::of)) {
/**
* Load current label/order pairs into a multimap, which will contain both new labels and
* those that stay on when processing is done.
*/
Multimap<String, Long> newAndRemaining =
currentStream
.map(line -> line.labelOrderPairs(canonicals))
.flatMap(x -> x)
.collect(
toMultimap(
LabelOrderPair::label, LabelOrderPair::orderId, this::listBackedMultiMap));

Multimap<String, Long> deleted =
previousStream
.map(
line -> {
// Mark labels that exist in both downloads with the SENTINEL id. This helps
// distinguish existing label with new order from new labels.
if (newAndRemaining.containsKey(line.label())
&& !newAndRemaining.containsEntry(line.label(), ORDER_ID_SENTINEL)) {
newAndRemaining.put(line.label(), ORDER_ID_SENTINEL);
}
return line;
})
.map(line -> line.labelOrderPairs(canonicals))
.flatMap(x -> x)
.filter(kv -> !newAndRemaining.remove(kv.label(), kv.orderId()))
.collect(
toMultimap(
LabelOrderPair::label, LabelOrderPair::orderId, this::listBackedMultiMap));

/**
* Labels in `newAndRemaining`:
*
* <ul>
* <li>Mapped to `sentinel` only: Labels without change, ignore
* <li>Mapped to `sentinel` and some orders: Existing labels with new order mapping. Those
* orders are new orders.
* <li>Mapped to some orders but not `sentinel`: New labels and new orders.
* </ul>
*
* <p>The `deleted` map has
*
* <ul>
* <li>Deleted labels: the keyset of deleted minus the keyset of the newAndRemaining
* <li>Deleted orders: the union of values.
* </ul>
*/
return new BsaDiff(
ImmutableMultimap.copyOf(newAndRemaining), ImmutableMultimap.copyOf(deleted), idnChecker);
}
}

Stream<Line> loadBlockLists(String jobName) {
return Stream.of(BlockList.values())
.map(blockList -> gcsClient.readBlockList(jobName, blockList))
.flatMap(x -> x)
.filter(line -> !line.startsWith(BSA_CSV_HEADER))
.map(BsaDiffCreator::parseLine);
}

static Line parseLine(String line) {
List<String> columns = LINE_SPLITTER.splitToList(line);
checkArgument(columns.size() == 2, "Invalid line: [%s]", line);
checkArgument(!Strings.isNullOrEmpty(columns.get(0)), "Missing label in line: [%s]", line);
ImmutableList<Long> orderIds =
ORDER_SPLITTER.splitToStream(columns.get(1)).map(Long::valueOf).collect(toImmutableList());

Check notice

Code scanning / CodeQL

Missing catch of NumberFormatException Note

Potential uncaught 'java.lang.NumberFormatException'.
checkArgument(!orderIds.isEmpty(), "Missing orders in line: [%s]", line);
checkArgument(!orderIds.contains(ORDER_ID_SENTINEL), "Invalid order id %s", ORDER_ID_SENTINEL);
return Line.of(columns.get(0), orderIds);
}

static class BsaDiff {
private final ImmutableMultimap<String, Long> newAndRemaining;

private final ImmutableMultimap<String, Long> deleted;
private final IdnChecker idnChecker;

BsaDiff(
ImmutableMultimap<String, Long> newAndRemaining,
ImmutableMultimap<String, Long> deleted,
IdnChecker idnChecker) {
this.newAndRemaining = newAndRemaining;
this.deleted = deleted;
this.idnChecker = idnChecker;
}

Stream<Order> getOrders() {
return Stream.concat(
newAndRemaining.values().stream()
.filter(value -> !Objects.equals(ORDER_ID_SENTINEL, value))
.distinct()
.map(id -> Order.of(id, OrderType.CREATE)),
deleted.values().stream().distinct().map(id -> Order.of(id, OrderType.DELETE)));
}

Stream<Label> getLabels() {
return Stream.concat(
newAndRemaining.asMap().entrySet().stream()
.filter(e -> e.getValue().size() > 1 || !e.getValue().contains(ORDER_ID_SENTINEL))
.map(
entry -> {
verify(!entry.getValue().isEmpty(), "Unexpected empty set");
LabelType labelType =
entry.getValue().contains(ORDER_ID_SENTINEL)
? LabelType.NEW_ORDER_ASSOCIATION
: LabelType.CREATE;
return Label.of(
entry.getKey(),
labelType,
idnChecker.getAllValidIdns(entry.getKey()).stream()
.map(IdnTableEnum::name)
.collect(toImmutableSet()));
}),
Sets.difference(deleted.keySet(), newAndRemaining.keySet()).stream()
.map(label -> Label.of(label, LabelType.DELETE, ImmutableSet.of())));
}
}

static class Canonicals<T> {
private final HashMap<T, T> cache;

Canonicals() {
cache = Maps.newHashMap();
}

T get(T value) {
cache.putIfAbsent(value, value);
return cache.get(value);
}
}

@AutoValue
abstract static class LabelOrderPair {
abstract String label();

abstract Long orderId();

static <K, V> LabelOrderPair of(String key, Long value) {
return new AutoValue_BsaDiffCreator_LabelOrderPair(key, value);
}
}

@AutoValue
abstract static class Line {
abstract String label();

abstract ImmutableList<Long> orderIds();

Stream<LabelOrderPair> labelOrderPairs(Canonicals<Long> canonicals) {
return orderIds().stream().map(id -> LabelOrderPair.of(label(), canonicals.get(id)));
}

static Line of(String label, ImmutableList<Long> orderIds) {
return new AutoValue_BsaDiffCreator_Line(label, orderIds);
}
}
}
14 changes: 13 additions & 1 deletion core/src/main/java/google/registry/bsa/BsaDownloadAction.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.flogger.FluentLogger;
import dagger.Lazy;
import google.registry.bsa.BlockListFetcher.LazyBlockList;
import google.registry.bsa.BsaDiffCreator.BsaDiff;
import google.registry.bsa.persistence.DownloadSchedule;
import google.registry.bsa.persistence.DownloadScheduler;
import google.registry.request.Action;
Expand All @@ -44,20 +46,26 @@ public class BsaDownloadAction implements Runnable {

private final DownloadScheduler downloadScheduler;
private final BlockListFetcher blockListFetcher;
private final BsaDiffCreator diffCreator;
private final GcsClient gcsClient;
private final Lazy<IdnChecker> lazyIdnChecker;
private final BsaLock bsaLock;
private final Response response;

@Inject
BsaDownloadAction(
DownloadScheduler downloadScheduler,
BlockListFetcher blockListFetcher,
BsaDiffCreator diffCreator,
GcsClient gcsClient,
Lazy<IdnChecker> lazyIdnChecker,
BsaLock bsaLock,
Response response) {
this.downloadScheduler = downloadScheduler;
this.blockListFetcher = blockListFetcher;
this.diffCreator = diffCreator;
this.gcsClient = gcsClient;
this.lazyIdnChecker = lazyIdnChecker;
this.bsaLock = bsaLock;
this.response = response;
}
Expand Down Expand Up @@ -113,7 +121,11 @@ Void runWithinLock() {
}
// Fall through
case MAKE_DIFF:
// TODO(11/30/2023): fill out the rest stages.
BsaDiff diff = diffCreator.createDiff(schedule, lazyIdnChecker.get());
gcsClient.writeOrderDiffs(schedule.jobName(), diff.getOrders());
gcsClient.writeLabelDiffs(schedule.jobName(), diff.getLabels());
schedule.updateJobStage(DownloadStage.APPLY_DIFF);
// Fall through
case APPLY_DIFF:
case START_UPLOADING:
case UPLOAD_DOMAINS_IN_USE:
Expand Down
14 changes: 9 additions & 5 deletions core/src/main/java/google/registry/bsa/api/Label.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,22 @@ public String serialize() {

public static Label deserialize(String text) {
List<String> items = SPLITTER.splitToList(text);
return of(
items.get(0),
LabelType.valueOf(items.get(1)),
ImmutableSet.copyOf(items.subList(2, items.size())));
try {
return of(
items.get(0),
LabelType.valueOf(items.get(1)),
ImmutableSet.copyOf(items.subList(2, items.size())));
} catch (NumberFormatException ne) {
throw new IllegalArgumentException(text);
}
}

public static Label of(String label, LabelType type, ImmutableSet<String> idnTables) {
return new AutoValue_Label(label, type, idnTables);
}

public enum LabelType {
ADD,
CREATE,
NEW_ORDER_ASSOCIATION,
DELETE;
}
Expand Down
6 changes: 5 additions & 1 deletion core/src/main/java/google/registry/bsa/api/Order.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@ public String serialize() {

public static Order deserialize(String text) {
List<String> items = SPLITTER.splitToList(text);
return of(Long.valueOf(items.get(0)), OrderType.valueOf(items.get(1)));
try {
return of(Long.valueOf(items.get(0)), OrderType.valueOf(items.get(1)));
} catch (NumberFormatException ne) {
throw new IllegalArgumentException(text);
}
}

public static Order of(long orderId, OrderType orderType) {
Expand Down
Loading

0 comments on commit e29f57d

Please sign in to comment.