From a74a1c24e842b7e6ac240cf7af97d3a305b320b8 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 14:59:21 +0100 Subject: [PATCH 01/18] Refactor out mongo forum from ingestor forum --- modules/ingestor/src/main/scala/cli.scala | 5 +- .../src/main/scala/ingestor.forum.scala | 167 ++---------------- .../ingestor/src/main/scala/ingestor.scala | 5 +- .../ingestor/src/main/scala/mongo.forum.scala | 166 +++++++++++++++++ 4 files changed, 191 insertions(+), 152 deletions(-) create mode 100644 modules/ingestor/src/main/scala/mongo.forum.scala diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index 4ecadad4..071a2ab9 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -32,8 +32,9 @@ object cli for config <- AppConfig.load.toResource res <- AppResources.instance(config) - forum <- ForumIngestor(res.lichess, res.elastic, res.store, config.ingestor.forum).toResource - team <- TeamIngestor(res.lichess, res.elastic, res.store, config.ingestor.team).toResource + forums <- Forums(res.lichess, config.ingestor.forum).toResource + forum = ForumIngestor(res.elastic, res.store, config.ingestor.forum, forums) + team <- TeamIngestor(res.lichess, res.elastic, res.store, config.ingestor.team).toResource study <- StudyIngestor( res.study, res.studyLocal, diff --git a/modules/ingestor/src/main/scala/ingestor.forum.scala b/modules/ingestor/src/main/scala/ingestor.forum.scala index 1d5c73b8..9941c7f0 100644 --- a/modules/ingestor/src/main/scala/ingestor.forum.scala +++ b/modules/ingestor/src/main/scala/ingestor.forum.scala @@ -3,17 +3,10 @@ package ingestor import cats.effect.IO import cats.syntax.all.* -import com.mongodb.client.model.changestream.FullDocument -import com.mongodb.client.model.changestream.OperationType.* -import mongo4cats.bson.Document -import mongo4cats.database.MongoDatabase -import mongo4cats.models.collection.ChangeStreamDocument -import 
mongo4cats.operations.{ Aggregate, Filter, Projection } import org.typelevel.log4cats.syntax.* import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant -import scala.concurrent.duration.* trait ForumIngestor: // watch change events from MongoDB and ingest forum posts into elastic search @@ -25,74 +18,38 @@ object ForumIngestor: private val index = Index.Forum - private val interestedOperations = List(DELETE, INSERT, REPLACE, UPDATE).map(_.getValue) + def apply(elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Forum, forums: Forums)(using + LoggerFactory[IO] + ): ForumIngestor = new: - private def maxPostSizeFilter(max: Int) = - Filter.expr(s"{ $$lte: [{ $$strLenCP: '$$fullDocument.text' }, $max] }") - - private def eventFilter(maxPostLength: Int) = - Filter.in("operationType", interestedOperations) && maxPostSizeFilter(maxPostLength) - - private val interestedFields = List(_id, F.text, F.topicId, F.troll, F.createdAt, F.userId, F.erasedAt) - private val postProjection = Projection.include(interestedFields) - - private val interestedEventFields = - List("operationType", "clusterTime", "documentKey._id") ++ interestedFields.map("fullDocument." 
+ _) - private val eventProjection = Projection.include(interestedEventFields) - - private def aggregate(maxPostLength: Int) = - Aggregate.matchBy(eventFilter(maxPostLength)).combinedWith(Aggregate.project(eventProjection)) - - def apply(mongo: MongoDatabase[IO], elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Forum)( - using LoggerFactory[IO] - ): IO[ForumIngestor] = - given Logger[IO] = summon[LoggerFactory[IO]].getLogger - (mongo.getCollection("f_topic"), mongo.getCollection("f_post")).mapN(apply(elastic, store, config)) - - def apply(elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Forum)( - topics: MongoCollection, - posts: MongoCollection - )(using Logger[IO]): ForumIngestor = new: + given Logger[IO] = LoggerFactory[IO].getLogger def watch: fs2.Stream[IO, Unit] = fs2.Stream .eval(startAt.flatTap(since => info"Starting forum ingestor from $since")) .flatMap: last => - changes(last) - .evalMap: events => - val lastEventTimestamp = events.flatten(_.clusterTime.flatMap(_.asInstant)).maxOption - val (toDelete, toIndex) = events.partition(_.isDelete) - storeBulk(toIndex.flatten(_.fullDocument)) - *> elastic.deleteMany(index, toDelete) - *> saveLastIndexedTimestamp(lastEventTimestamp.getOrElse(Instant.now())) + forums + .watch(last) + .evalMap: result => + storeBulk(result.toIndex) + *> elastic.deleteMany(index, result.toDelete) + *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now())) def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] = - val filter = range(F.createdAt)(since, until.some) - .or(range(F.updatedAt)(since, until.some)) - .or(range(F.erasedAt)(since, until.some)) - posts - .find(filter) - .projection(postProjection) - .boundedStream(config.batchSize) - .filter(_.validText) - .chunkN(config.batchSize) - .map(_.toList) - .metered(1.second) // to avoid overloading the elasticsearch - .evalMap: docs => - val (toDelete, toIndex) = docs.partition(_.isErased) + forums + .fetch(since, until) 
+ .evalMap: result => dryRun.fold( - toIndex.traverse_(doc => debug"Would index $doc") - *> toDelete.traverse_(doc => debug"Would delete $doc"), - storeBulk(toIndex) *> elastic.deleteMany(index, toDelete) + result.toIndex.traverse_(doc => debug"Would index $doc") + *> result.toDelete.traverse_(doc => debug"Would delete $doc"), + storeBulk(result.toIndex) *> elastic.deleteMany(index, result.toDelete) ) - private def storeBulk(docs: List[Document]): IO[Unit] = + private def storeBulk(docs: List[(String, ForumSource)]): IO[Unit] = info"Received ${docs.size} forum posts to index" *> - docs.toSources - .flatMap: sources => - elastic.storeBulk(index, sources) *> info"Indexed ${sources.size} forum posts" + elastic.storeBulk(index, docs) *> info"Indexed ${docs.size} forum posts" .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to index forum posts: ${docs.map(_.id).mkString(", ")}") + Logger[IO].error(e)(s"Failed to index forum posts: ${docs.map(_._1).mkString(", ")}") .whenA(docs.nonEmpty) private def saveLastIndexedTimestamp(time: Instant): IO[Unit] = @@ -101,89 +58,3 @@ object ForumIngestor: private def startAt: IO[Option[Instant]] = config.startAt.fold(store.get(index.value))(_.some.pure[IO]) - - // Fetches topic names by their ids - private def topicByIds(ids: Seq[String]): IO[Map[String, String]] = - topics - .find(Filter.in(_id, ids)) - .projection(Projection.include(List(_id, Topic.name))) - .all - .map(_.map(doc => (doc.id, doc.getString(Topic.name)).mapN(_ -> _)).flatten.toMap) - - private def changes(since: Option[Instant]): fs2.Stream[IO, List[ChangeStreamDocument[Document]]] = - val builder = posts.watch(aggregate(config.maxPostLength)) - // skip the first event if we're starting from a specific timestamp - // since the event at that timestamp is already indexed - val skip = since.fold(0)(_ => 1) - since - .fold(builder)(x => builder.startAtOperationTime(x.asBsonTimestamp)) - .fullDocument(FullDocument.UPDATE_LOOKUP) // this is required for update event - 
.batchSize(config.batchSize) - .boundedStream(config.batchSize) - .drop(skip) - .groupWithin(config.batchSize, config.timeWindows.second) - .evalTap(_.traverse_(x => debug"received $x")) - .map(_.toList.distincByDocId) - - private type SourceWithId = (String, ForumSource) - - extension (events: List[Document]) - private def toSources: IO[List[SourceWithId]] = - val topicIds = events.flatMap(_.topicId).distinct - topicIds.isEmpty.fold( - info"no topics found for posts: $events".as(Nil), - topicByIds(topicIds) - .flatMap: topicMap => - events - .traverse(_.toSource(topicMap)) - .map(_.flatten) - ) - - extension (doc: Document) - - private def toSource(topicMap: Map[String, String]): IO[Option[SourceWithId]] = - (doc.id, doc.topicId) - .flatMapN: (id, topicId) => - doc.toSource(topicMap.get(topicId), topicId).map(id -> _) - .pure[IO] - .flatTap: source => - def reason = doc.id.fold("missing doc._id; ")(_ => "") - + doc.topicId.fold("missing doc.topicId; ")(_ => "") - + doc.topicId - .map(id => topicMap.get(id).fold("topic or topic.name is missing")(_ => "")) - .getOrElse("") - info"failed to convert document to source: $doc because $reason".whenA(source.isEmpty) - - private def toSource(topicName: Option[String], topicId: String): Option[ForumSource] = - ( - doc.getString(F.text), - topicName, - doc.getBoolean(F.troll), - doc.getNested(F.createdAt).flatMap(_.asInstant).map(_.toEpochMilli), - doc.getString(F.userId).some - ).mapN(ForumSource.apply(_, _, topicId, _, _, _)) - - private def isErased: Boolean = - doc.get("erasedAt").isDefined - - private def topicId: Option[String] = - doc.getString(F.topicId) - - private def validText: Boolean = - doc.getString(F.text).exists(_.length <= config.maxPostLength) - - extension (event: ChangeStreamDocument[Document]) - private def isDelete: Boolean = - event.operationType == DELETE || event.fullDocument.exists(_.isErased) - - object F: - val text = "text" - val topicId = "topicId" - val troll = "troll" - val userId = "userId" 
- val createdAt = "createdAt" - val updatedAt = "updatedAt" - val erasedAt = "erasedAt" - - object Topic: - val name = "name" diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index bc85b880..ec53e17d 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -20,11 +20,12 @@ object Ingestor: config: IngestorConfig )(using LoggerFactory[IO]): IO[Ingestor] = ( - ForumIngestor(lichess, elastic, store, config.forum), + Forums(lichess, config.forum), TeamIngestor(lichess, elastic, store, config.team), StudyIngestor(study, local, elastic, store, config.study), GameIngestor(lichess, elastic, store, config.game) - ).mapN: (forum, team, study, game) => + ).mapN: (forums, team, study, game) => + val forum = ForumIngestor(elastic, store, config.forum, forums) new Ingestor: def run() = fs2 diff --git a/modules/ingestor/src/main/scala/mongo.forum.scala b/modules/ingestor/src/main/scala/mongo.forum.scala new file mode 100644 index 00000000..60b896fa --- /dev/null +++ b/modules/ingestor/src/main/scala/mongo.forum.scala @@ -0,0 +1,166 @@ +package lila.search +package ingestor + +import cats.effect.IO +import cats.syntax.all.* +import com.mongodb.client.model.changestream.FullDocument +import com.mongodb.client.model.changestream.OperationType.* +import mongo4cats.bson.Document +import mongo4cats.database.MongoDatabase +import mongo4cats.models.collection.ChangeStreamDocument +import mongo4cats.operations.{ Aggregate, Filter, Projection } +import org.typelevel.log4cats.syntax.* +import org.typelevel.log4cats.{ Logger, LoggerFactory } + +import java.time.Instant +import scala.concurrent.duration.* + +import Forums.Result + +trait Forums: + def watch(since: Option[Instant]): fs2.Stream[IO, Result] + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] + +object Forums: + + private type SourceWithId = (String, ForumSource) + case class Result(toIndex: 
List[(String, ForumSource)], toDelete: List[Id], timestamp: Option[Instant]) + + private val interestedOperations = List(DELETE, INSERT, REPLACE, UPDATE).map(_.getValue) + + private def maxPostSizeFilter(max: Int) = + Filter.expr(s"{ $$lte: [{ $$strLenCP: '$$fullDocument.text' }, $max] }") + + private def eventFilter(maxPostLength: Int) = + Filter.in("operationType", interestedOperations) && maxPostSizeFilter(maxPostLength) + + private val interestedFields = List(_id, F.text, F.topicId, F.troll, F.createdAt, F.userId, F.erasedAt) + private val postProjection = Projection.include(interestedFields) + + private val interestedEventFields = + List("operationType", "clusterTime", "documentKey._id") ++ interestedFields.map("fullDocument." + _) + private val eventProjection = Projection.include(interestedEventFields) + + private def aggregate(maxPostLength: Int) = + Aggregate.matchBy(eventFilter(maxPostLength)).combinedWith(Aggregate.project(eventProjection)) + + def apply(mongo: MongoDatabase[IO], config: IngestorConfig.Forum)(using + LoggerFactory[IO] + ): IO[Forums] = + given Logger[IO] = LoggerFactory[IO].getLogger + (mongo.getCollection("f_topic"), mongo.getCollection("f_post")).mapN(apply(config)) + + def apply(config: IngestorConfig.Forum)( + topics: MongoCollection, + posts: MongoCollection + )(using Logger[IO]): Forums = new: + + def fetch(since: Instant, until: Instant) = + val filter = range(F.createdAt)(since, until.some) + .or(range(F.updatedAt)(since, until.some)) + .or(range(F.erasedAt)(since, until.some)) + posts + .find(filter) + .projection(postProjection) + .boundedStream(config.batchSize) + .filter(_.validText) + .chunkN(config.batchSize) + .map(_.toList) + .metered(1.second) + .evalMap: events => + val (toDelete, toIndex) = events.partition(_.isErased) + toIndex.toSources + .map: sources => + Result(sources, toDelete.flatten(_.id.map(Id.apply)), none) + + def watch(since: Option[Instant]): fs2.Stream[IO, Result] = + val builder = 
posts.watch(aggregate(config.maxPostLength)) + // skip the first event if we're starting from a specific timestamp + // since the event at that timestamp is already indexed + val skip = since.fold(0)(_ => 1) + since + .fold(builder)(x => builder.startAtOperationTime(x.asBsonTimestamp)) + .fullDocument(FullDocument.UPDATE_LOOKUP) // this is required for update event + .batchSize(config.batchSize) + .boundedStream(config.batchSize) + .drop(skip) + .groupWithin(config.batchSize, config.timeWindows.second) + .evalTap(_.traverse_(x => debug"received $x")) + .map(_.toList.distincByDocId) + .evalMap: events => + val lastEventTimestamp = events.flatten(_.clusterTime.flatMap(_.asInstant)).maxOption + val (toDelete, toIndex) = events.partition(_.isDelete) + toIndex + .flatten(_.fullDocument) + .toSources + .map: sources => + Result(sources, toDelete.flatten(_.docId.map(Id.apply)), lastEventTimestamp) + + // Fetches topic names by their ids + private def topicByIds(ids: Seq[String]): IO[Map[String, String]] = + topics + .find(Filter.in(_id, ids)) + .projection(Projection.include(List(_id, Topic.name))) + .all + .map(_.map(doc => (doc.id, doc.getString(Topic.name)).mapN(_ -> _)).flatten.toMap) + + extension (events: List[Document]) + private def toSources: IO[List[SourceWithId]] = + val topicIds = events.flatMap(_.topicId).distinct + topicIds.isEmpty.fold( + info"no topics found for posts: $events".as(Nil), + topicByIds(topicIds) + .flatMap: topicMap => + events + .traverse(_.toSource(topicMap)) + .map(_.flatten) + ) + + extension (doc: Document) + + private def toSource(topicMap: Map[String, String]): IO[Option[SourceWithId]] = + (doc.id, doc.topicId) + .flatMapN: (id, topicId) => + doc.toSource(topicMap.get(topicId), topicId).map(id -> _) + .pure[IO] + .flatTap: source => + def reason = doc.id.fold("missing doc._id; ")(_ => "") + + doc.topicId.fold("missing doc.topicId; ")(_ => "") + + doc.topicId + .map(id => topicMap.get(id).fold("topic or topic.name is missing")(_ => "")) 
+ .getOrElse("") + info"failed to convert document to source: $doc because $reason".whenA(source.isEmpty) + + private def toSource(topicName: Option[String], topicId: String): Option[ForumSource] = + ( + doc.getString(F.text), + topicName, + doc.getBoolean(F.troll), + doc.getNested(F.createdAt).flatMap(_.asInstant).map(_.toEpochMilli), + doc.getString(F.userId).some + ).mapN(ForumSource.apply(_, _, topicId, _, _, _)) + + private def isErased: Boolean = + doc.get("erasedAt").isDefined + + private def topicId: Option[String] = + doc.getString(F.topicId) + + private def validText: Boolean = + doc.getString(F.text).exists(_.length <= config.maxPostLength) + + extension (event: ChangeStreamDocument[Document]) + private def isDelete: Boolean = + event.operationType == DELETE || event.fullDocument.exists(_.isErased) + + object F: + val text = "text" + val topicId = "topicId" + val troll = "troll" + val userId = "userId" + val createdAt = "createdAt" + val updatedAt = "updatedAt" + val erasedAt = "erasedAt" + + object Topic: + val name = "name" From 104a313e08852eeae8cfaf6d312076e24161c46a Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 15:42:52 +0100 Subject: [PATCH 02/18] Refactor study mongo --- modules/ingestor/src/main/scala/cli.scala | 11 +- .../ingestor/src/main/scala/ingestor.scala | 5 +- .../src/main/scala/ingestor.study.scala | 168 +++--------------- .../ingestor/src/main/scala/mongo.study.scala | 152 ++++++++++++++++ 4 files changed, 185 insertions(+), 151 deletions(-) create mode 100644 modules/ingestor/src/main/scala/mongo.study.scala diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index 071a2ab9..c24fe20c 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -34,14 +34,9 @@ object cli res <- AppResources.instance(config) forums <- Forums(res.lichess, config.ingestor.forum).toResource forum = ForumIngestor(res.elastic, res.store, 
config.ingestor.forum, forums) - team <- TeamIngestor(res.lichess, res.elastic, res.store, config.ingestor.team).toResource - study <- StudyIngestor( - res.study, - res.studyLocal, - res.elastic, - res.store, - config.ingestor.study - ).toResource + team <- TeamIngestor(res.lichess, res.elastic, res.store, config.ingestor.team).toResource + studies <- Studies(res.study, res.studyLocal, config.ingestor.study).toResource + study = StudyIngestor(studies, res.elastic, res.store, config.ingestor.study) game <- GameIngestor(res.lichess, res.elastic, res.store, config.ingestor.game).toResource yield Executor(forum, study, game, team) diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index ec53e17d..0aa13fc5 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -21,11 +21,12 @@ object Ingestor: )(using LoggerFactory[IO]): IO[Ingestor] = ( Forums(lichess, config.forum), + Studies(study, local, config.study), TeamIngestor(lichess, elastic, store, config.team), - StudyIngestor(study, local, elastic, store, config.study), GameIngestor(lichess, elastic, store, config.game) - ).mapN: (forums, team, study, game) => + ).mapN: (forums, studies, team, game) => val forum = ForumIngestor(elastic, store, config.forum, forums) + val study = StudyIngestor(studies, elastic, store, config.study) new Ingestor: def run() = fs2 diff --git a/modules/ingestor/src/main/scala/ingestor.study.scala b/modules/ingestor/src/main/scala/ingestor.study.scala index 5ed29808..52939400 100644 --- a/modules/ingestor/src/main/scala/ingestor.study.scala +++ b/modules/ingestor/src/main/scala/ingestor.study.scala @@ -3,9 +3,6 @@ package ingestor import cats.effect.IO import cats.syntax.all.* -import mongo4cats.bson.Document -import mongo4cats.database.MongoDatabase -import mongo4cats.operations.{ Filter, Projection } import org.typelevel.log4cats.syntax.* import org.typelevel.log4cats.{ 
Logger, LoggerFactory } @@ -20,156 +17,45 @@ object StudyIngestor: private val index = Index.Study - private val interestedfields = List("_id", F.name, F.members, F.ownerId, F.visibility, F.topics, F.likes) - - private val indexDocProjection = Projection.include(interestedfields) - private val deleteDocProjection = Projection.include(F.oplogId) - def apply( - study: MongoDatabase[IO], - local: MongoDatabase[IO], + studies: Studies, elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Study - )(using LoggerFactory[IO]): IO[StudyIngestor] = - given Logger[IO] = summon[LoggerFactory[IO]].getLogger - (study.getCollection("study"), ChapterRepo(study), local.getCollection("oplog.rs")) - .mapN(apply(elastic, store, config)) - - def apply(elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Study)( - studies: MongoCollection, - chapters: ChapterRepo, - oplogs: MongoCollection - )(using Logger[IO]): StudyIngestor = new: + )(using LoggerFactory[IO]): StudyIngestor = new: + given Logger[IO] = LoggerFactory[IO].getLogger def watch: fs2.Stream[IO, Unit] = - intervalStream - .meteredStartImmediately(config.interval) - .flatMap: (since, until) => - run(since, until, dryRun = false) + fs2.Stream + .eval( + config.startAt.fold(store.get(index.value))(_.some.pure[IO]) + ) + .flatMap: since => + studies + .watch(since) + .evalMap: result => + storeBulk(result.toIndex, false) *> elastic.deleteMany(index, result.toDelete) + *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now())) def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] = - fs2.Stream.eval(info"Indexing studies from $since to $until") ++ - pullAndIndex(since, until, dryRun) ++ - fs2.Stream.eval(info"deleting studies from $since to $until") ++ - pullAndDelete(since, until, dryRun) - ++ fs2.Stream.eval(saveLastIndexedTimestamp(until)) - - def pullAndIndex(since: Instant, until: Instant, dryRun: Boolean = false): fs2.Stream[IO, Unit] = - val filter = 
range(F.createdAt)(since, until.some) - .or(range(F.updatedAt)(since, until.some)) studies - .find(filter) - .projection(indexDocProjection) - .boundedStream(config.batchSize) - .chunkN(config.batchSize) - .map(_.toList) - .evalTap(_.traverse_(x => debug"received $x")) - .evalMap(storeBulk(_, dryRun)) - - def pullAndDelete(since: Instant, until: Instant, dryRun: Boolean = false): fs2.Stream[IO, Unit] = - val filter = - Filter - .gte("ts", since.asBsonTimestamp) - .and(Filter.lt("ts", until.asBsonTimestamp)) - .and(Filter.eq("ns", s"${config.databaseName}.study")) - .and(Filter.eq("op", "d")) - oplogs - .find(filter) - .projection(deleteDocProjection) - .boundedStream(config.batchSize) - .chunkN(config.batchSize) - .map(_.toList.flatMap(extractId)) - .evalTap(xs => info"Deleting $xs") - .evalMap: + .fetch(since, until) + .evalMap: result => dryRun.fold( - xs => xs.traverse_(x => debug"Would delete $x"), - elastic.deleteMany(index, _) + result.toIndex.traverse_(doc => debug"Would index $doc") + *> result.toDelete.traverse_(doc => debug"Would delete $doc"), + storeBulk(result.toIndex, dryRun) *> elastic.deleteMany(index, result.toDelete) ) - def storeBulk(docs: List[Document], dryRun: Boolean = false): IO[Unit] = - info"Received ${docs.size} studies to index" *> - docs.toSources.flatMap: sources => - dryRun.fold( - sources.traverse_(source => debug"Would index $source"), - elastic.storeBulk(index, sources) *> info"Indexed ${sources.size} studies" - .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to index studies: ${docs.map(_.id).mkString(", ")}") - .whenA(docs.nonEmpty) - ) + def storeBulk(sources: List[(String, StudySource)], dryRun: Boolean = false): IO[Unit] = + info"Received ${sources.size} studies to index" *> + dryRun.fold( + sources.traverse_(source => debug"Would index $source"), + elastic.storeBulk(index, sources) *> info"Indexed ${sources.size} studies" + .handleErrorWith: e => + Logger[IO].error(e)(s"Failed to index studies: 
${sources.map(_._1).mkString(", ")}") + .whenA(sources.nonEmpty) + ) def saveLastIndexedTimestamp(time: Instant): IO[Unit] = store.put(index.value, time) *> info"Stored last indexed time ${time.getEpochSecond} for $index" - - def extractId(doc: Document): Option[Id] = - doc.getNestedAs[String](F.oplogId).map(Id.apply) - - def intervalStream: fs2.Stream[IO, (Instant, Instant)] = - fs2.Stream - .eval: - config.startAt.fold(store.get(index.value))(_.some.pure[IO]) - .flatMap: startAt => - startAt.fold(fs2.Stream.empty)(since => fs2.Stream(since)) - ++ fs2.Stream - .eval(IO.realTimeInstant) - .flatMap(now => fs2.Stream.unfold(now)(s => (s, s.plusSeconds(config.interval.toSeconds)).some)) - .zipWithNext - .map((since, until) => since -> until.get) - - extension (docs: List[Document]) - private def toSources: IO[List[StudySourceWithId]] = - val studyIds = docs.flatMap(_.id).distinct - chapters - .byStudyIds(studyIds) - .flatMap: chapters => - docs - .traverseFilter(_.toSource(chapters)) - - type StudySourceWithId = (String, StudySource) - extension (doc: Document) - private def toSource(chapters: Map[String, StudyData]): IO[Option[StudySourceWithId]] = - doc.id - .flatMap: id => - ( - doc.getName, - doc.getOwnerId, - doc.getMembers.some, - doc.getChapterNames(chapters), - doc.getChapterTexts(chapters), - doc.getLikes.some, - doc.getPublic.some, - doc.getTopics.some - ) - .mapN(StudySource.apply) - .map(id -> _) - .pure[IO] - .flatTap: source => - def reason = - doc.id.fold("missing doc._id; ")(_ => "") - + doc.getName.fold("missing doc.name; ")(_ => "") - + doc.getOwnerId.fold("missing doc.ownerId; ")(_ => "") - + doc.getChapterNames(chapters).fold("missing doc.chapterNames; ")(_ => "") - + doc.getChapterTexts(chapters).fold("missing doc.chapterTexts; ")(_ => "") - info"failed to convert document to source: $doc because $reason".whenA(source.isEmpty) - - private def getName = doc.getString(F.name) - private def getOwnerId = doc.getString(F.ownerId) - private def 
getMembers = doc.getDocument(F.members).fold(Nil)(_.toMap.keys.toList) - private def getTopics = doc.getList(F.topics).map(_.flatMap(_.asString)).getOrElse(Nil) - private def getLikes = doc.getInt(F.likes).getOrElse(0) - private def getChapterTexts(chapters: Map[String, StudyData]) = - chapters.get(doc.id.getOrElse("")).map(_.chapterTexts) - private def getChapterNames(chapters: Map[String, StudyData]) = - chapters.get(doc.id.getOrElse("")).map(_.chapterNames) - private def getPublic = doc.getString(F.visibility).map(_ == "public").getOrElse(true) - - object F: - val name = "name" - val likes = "likes" - val members = "members" - val ownerId = "ownerId" - val visibility = "visibility" - val topics = "topics" - val createdAt = "createdAt" - val updatedAt = "updatedAt" - val oplogId = "o._id" diff --git a/modules/ingestor/src/main/scala/mongo.study.scala b/modules/ingestor/src/main/scala/mongo.study.scala new file mode 100644 index 00000000..239a9e12 --- /dev/null +++ b/modules/ingestor/src/main/scala/mongo.study.scala @@ -0,0 +1,152 @@ +package lila.search +package ingestor + +import cats.effect.IO +import cats.syntax.all.* +import mongo4cats.bson.Document +import mongo4cats.database.MongoDatabase +import mongo4cats.operations.{ Filter, Projection } +import org.typelevel.log4cats.syntax.* +import org.typelevel.log4cats.{ Logger, LoggerFactory } + +import java.time.Instant + +import Studies.Result + +trait Studies: + def watch(since: Option[Instant]): fs2.Stream[IO, Result] + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] + +object Studies: + + private type SourceWithId = (String, StudySource) + case class Result(toIndex: List[SourceWithId], toDelete: List[Id], timestamp: Option[Instant]) + + private val interestedfields = List("_id", F.name, F.members, F.ownerId, F.visibility, F.topics, F.likes) + + private val indexDocProjection = Projection.include(interestedfields) + private val deleteDocProjection = Projection.include(F.oplogId) + + def apply( 
+ study: MongoDatabase[IO], + local: MongoDatabase[IO], + config: IngestorConfig.Study + )(using LoggerFactory[IO]): IO[Studies] = + given Logger[IO] = summon[LoggerFactory[IO]].getLogger + (study.getCollection("study"), ChapterRepo(study), local.getCollection("oplog.rs")) + .mapN(apply(config)) + + def apply(config: IngestorConfig.Study)( + studies: MongoCollection, + chapters: ChapterRepo, + oplogs: MongoCollection + )(using Logger[IO]): Studies = new: + + def watch(since: Option[Instant]): fs2.Stream[IO, Result] = + intervalStream(since) + .meteredStartImmediately(config.interval) + .flatMap(fetch) + + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] = + // fs2.Stream.eval(info"Indexing studies from $since to $until") ++ + // fs2.Stream.eval(info"deleting studies from $since to $until") ++ + pullAndIndex(since, until) + .zip(pullAndDelete(since, until)) + .map((toIndex, toDelete) => Result(toIndex, toDelete, until.some)) + + def pullAndIndex(since: Instant, until: Instant) = + val filter = range(F.createdAt)(since, until.some) + .or(range(F.updatedAt)(since, until.some)) + studies + .find(filter) + .projection(indexDocProjection) + .boundedStream(config.batchSize) + .chunkN(config.batchSize) + .map(_.toList) + .evalTap(_.traverse_(x => debug"received $x")) + .evalMap(_.toSources) + + def pullAndDelete(since: Instant, until: Instant) = + val filter = + Filter + .gte("ts", since.asBsonTimestamp) + .and(Filter.lt("ts", until.asBsonTimestamp)) + .and(Filter.eq("ns", s"${config.databaseName}.study")) + .and(Filter.eq("op", "d")) + oplogs + .find(filter) + .projection(deleteDocProjection) + .boundedStream(config.batchSize) + .chunkN(config.batchSize) + .map(_.toList.flatMap(extractId)) + .evalTap(xs => info"Deleting $xs") + + def extractId(doc: Document): Option[Id] = + doc.getNestedAs[String](F.oplogId).map(Id.apply) + + def intervalStream(startAt: Option[Instant]): fs2.Stream[IO, (Instant, Instant)] = + (startAt.fold(fs2.Stream.empty)(since => 
fs2.Stream(since)) + ++ fs2.Stream + .eval(IO.realTimeInstant) + .flatMap(now => + fs2.Stream.unfold(now)(s => (s, s.plusSeconds(config.interval.toSeconds)).some) + )).zipWithNext + .map((since, until) => since -> until.get) + + extension (docs: List[Document]) + private def toSources: IO[List[StudySourceWithId]] = + val studyIds = docs.flatMap(_.id).distinct + chapters + .byStudyIds(studyIds) + .flatMap: chapters => + docs + .traverseFilter(_.toSource(chapters)) + + type StudySourceWithId = (String, StudySource) + extension (doc: Document) + private def toSource(chapters: Map[String, StudyData]): IO[Option[StudySourceWithId]] = + doc.id + .flatMap: id => + ( + doc.getName, + doc.getOwnerId, + doc.getMembers.some, + doc.getChapterNames(chapters), + doc.getChapterTexts(chapters), + doc.getLikes.some, + doc.getPublic.some, + doc.getTopics.some + ) + .mapN(StudySource.apply) + .map(id -> _) + .pure[IO] + .flatTap: source => + def reason = + doc.id.fold("missing doc._id; ")(_ => "") + + doc.getName.fold("missing doc.name; ")(_ => "") + + doc.getOwnerId.fold("missing doc.ownerId; ")(_ => "") + + doc.getChapterNames(chapters).fold("missing doc.chapterNames; ")(_ => "") + + doc.getChapterTexts(chapters).fold("missing doc.chapterTexts; ")(_ => "") + info"failed to convert document to source: $doc because $reason".whenA(source.isEmpty) + + private def getName = doc.getString(F.name) + private def getOwnerId = doc.getString(F.ownerId) + private def getMembers = doc.getDocument(F.members).fold(Nil)(_.toMap.keys.toList) + private def getTopics = doc.getList(F.topics).map(_.flatMap(_.asString)).getOrElse(Nil) + private def getLikes = doc.getInt(F.likes).getOrElse(0) + private def getChapterTexts(chapters: Map[String, StudyData]) = + chapters.get(doc.id.getOrElse("")).map(_.chapterTexts) + private def getChapterNames(chapters: Map[String, StudyData]) = + chapters.get(doc.id.getOrElse("")).map(_.chapterNames) + private def getPublic = doc.getString(F.visibility).map(_ == 
"public").getOrElse(true) + + object F: + val name = "name" + val likes = "likes" + val members = "members" + val ownerId = "ownerId" + val visibility = "visibility" + val topics = "topics" + val createdAt = "createdAt" + val updatedAt = "updatedAt" + val oplogId = "o._id" From 8f61d02c190b25b8fda638613b3ac0d4671e65a2 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 16:09:23 +0100 Subject: [PATCH 03/18] Refactor mongo game --- modules/ingestor/src/main/scala/cli.scala | 3 +- .../src/main/scala/ingestor.game.scala | 270 ++---------------- .../ingestor/src/main/scala/ingestor.scala | 7 +- .../ingestor/src/main/scala/mongo.game.scala | 253 ++++++++++++++++ 4 files changed, 283 insertions(+), 250 deletions(-) create mode 100644 modules/ingestor/src/main/scala/mongo.game.scala diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index c24fe20c..4f490c93 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -37,7 +37,8 @@ object cli team <- TeamIngestor(res.lichess, res.elastic, res.store, config.ingestor.team).toResource studies <- Studies(res.study, res.studyLocal, config.ingestor.study).toResource study = StudyIngestor(studies, res.elastic, res.store, config.ingestor.study) - game <- GameIngestor(res.lichess, res.elastic, res.store, config.ingestor.game).toResource + games <- Games(res.lichess, config.ingestor.game).toResource + game = GameIngestor(games, res.elastic, res.store, config.ingestor.game) yield Executor(forum, study, game, team) class Executor( diff --git a/modules/ingestor/src/main/scala/ingestor.game.scala b/modules/ingestor/src/main/scala/ingestor.game.scala index 2fb30e15..8df722d5 100644 --- a/modules/ingestor/src/main/scala/ingestor.game.scala +++ b/modules/ingestor/src/main/scala/ingestor.game.scala @@ -3,22 +3,10 @@ package ingestor import cats.effect.* import cats.syntax.all.* -import chess.Speed -import chess.variant.* -import 
com.mongodb.client.model.changestream.FullDocument -import com.mongodb.client.model.changestream.OperationType.* -import io.circe.* -import mongo4cats.circe.* -import mongo4cats.collection.MongoCollection -import mongo4cats.database.MongoDatabase -import mongo4cats.models.collection.ChangeStreamDocument -import mongo4cats.operations.{ Aggregate, Filter, Projection } -import org.bson.BsonTimestamp import org.typelevel.log4cats.syntax.* import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant -import scala.concurrent.duration.* trait GameIngestor: // watch change events from game5 collection and ingest games into elastic search @@ -32,259 +20,49 @@ object GameIngestor: private val index = Index.Game - private val interestedOperations = List(UPDATE, DELETE).map(_.getValue) - private val eventFilter = Filter.in("operationType", interestedOperations) - - private val interestedEventFields = - List( - "operationType", - "clusterTime", - "documentKey._id", - "fullDocument" - ) // TODO only include interestedFields - - private val eventProjection = Projection.include(interestedEventFields) - - // https://github.com/lichess-org/lila/blob/65e6dd88e99cfa0068bc790a4518a6edb3513f54/modules/gameSearch/src/main/GameSearchApi.scala#L52 - val gameFilter: Filter = - // Filter games that finished - // https://github.com/lichess-org/scalachess/blob/18edf46a50445048fdc2ee5a83752e5b3884f490/core/src/main/scala/Status.scala#L18-L27 - val statusFilter = Filter.gte("s", 30) - val noImportFilter = Filter.ne("so", 7) - // us fields is the list of player ids, if it's missing then it's - // an all anonymous (or anonymous vs stockfish) game - val noAllAnonFilter = Filter.exists("us") - statusFilter.and(noImportFilter).and(noAllAnonFilter) - - // https://github.com/lichess-org/lila/blob/65e6dd88e99cfa0068bc790a4518a6edb3513f54/modules/gameSearch/src/main/GameSearchApi.scala#L52 - val changeFilter: Filter = - // Filter games that finished - // 
https://github.com/lichess-org/scalachess/blob/18edf46a50445048fdc2ee5a83752e5b3884f490/core/src/main/scala/Status.scala#L18-L27 - val statusFilter = Filter.gte("fullDocument.s", 30) - val noImportFilter = Filter.ne("fullDocument.so", 7) - // us fields is the list of player ids, if it's missing then it's - // an all anonymous (or anonymous vs stockfish) game - val noAllAnonFilter = Filter.exists("fullDocument.us") - statusFilter.and(noImportFilter).and(noAllAnonFilter) - - private val aggregate = - Aggregate.matchBy(eventFilter.and(changeFilter)).combinedWith(Aggregate.project(eventProjection)) - - def apply(mongo: MongoDatabase[IO], elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Game)( - using LoggerFactory[IO] - ): IO[GameIngestor] = - given Logger[IO] = summon[LoggerFactory[IO]].getLogger - mongo.getCollectionWithCodec[DbGame]("game5").map(apply(elastic, store, config)) - - def apply( - elastic: ESClient[IO], - store: KVStore, - config: IngestorConfig.Game - )(games: MongoCollection[IO, DbGame])(using Logger[IO]): GameIngestor = new: + def apply(games: Games, elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Game)(using + LoggerFactory[IO] + ): GameIngestor = new: + given Logger[IO] = LoggerFactory[IO].getLogger def watch: fs2.Stream[IO, Unit] = fs2.Stream .eval(startAt.flatTap(since => info"Starting game ingestor from $since")) - .flatMap(watch(_, dryRun = false)) + .flatMap(games.watch(_)) + .evalMap(updateElastic(_, false)) def watch(since: Option[Instant], dryRun: Boolean): fs2.Stream[IO, Unit] = - changes(since) - .evalMap: events => - val lastEventTimestamp = events.lastOption.flatMap(_.clusterTime).flatMap(_.asInstant) - val (toDelete, toIndex) = events.partition(_.operationType == DELETE) - dryRun.fold( - info"Would index total ${toIndex.size} games and delete ${toDelete.size} games" *> - toIndex.flatMap(_.fullDocument).traverse_(x => debug"Would index ${x.debug}") - *> toDelete.traverse_(x => debug"Would delete ${x.docId}"), - 
storeBulk(toIndex.flatten(_.fullDocument)) - *> elastic.deleteMany(index, toDelete) - *> saveLastIndexedTimestamp(lastEventTimestamp.getOrElse(Instant.now)) - ) + games + .watch(since) + .evalMap(updateElastic(_, dryRun)) + + def updateElastic(result: Games.Result, dryRun: Boolean): IO[Unit] = + dryRun.fold( + info"Would index total ${result.toIndex.size} games and delete ${result.toDelete.size} games" *> + result.toIndex.traverse_(x => debug"Would index $x") + *> result.toDelete.traverse_(x => debug"Would delete $x"), + storeBulk(result.toIndex) + *> elastic.deleteMany(index, result.toDelete) + *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now)) + ) def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] = - val filter = range(F.createdAt)(since, until.some) - .or(range(F.updatedAt)(since, until.some)) games - .find(filter.and(gameFilter)) - // .projection(postProjection) - .boundedStream(config.batchSize) - .chunkN(config.batchSize) - .map(_.toList) - .metered(1.second) // to avoid overloading the elasticsearch - .evalMap: docs => - dryRun.fold( - info"Would index total ${docs.size} games" *> - docs.traverse_(doc => debug"Would index $doc"), - storeBulk(docs) - ) + .fetch(since, until) + .evalMap(updateElastic(_, dryRun)) - private def storeBulk(docs: List[DbGame]): IO[Unit] = - val sources = docs.map(_.toSource) - info"Received ${docs.size} ${index.value}s to index" *> + private def storeBulk(sources: List[(String, GameSource)]): IO[Unit] = + info"Received ${sources.size} ${index.value}s to index" *> elastic .storeBulk(index, sources) .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to index ${index.value}s: ${docs.map(_.id).mkString(", ")}") + Logger[IO].error(e)(s"Failed to index ${index.value}s: ${sources.map(_._1).mkString(", ")}") .whenA(sources.nonEmpty) *> info"Indexed ${sources.size} ${index.value}s" - private def changes(since: Option[Instant]): fs2.Stream[IO, List[ChangeStreamDocument[DbGame]]] = - val 
builder = games.watch(aggregate) - since - .fold(builder)(x => builder.startAtOperationTime(x.asBsonTimestamp)) - .fullDocument(FullDocument.UPDATE_LOOKUP) // this is required for update event - .batchSize(config.batchSize) - .boundedStream(config.batchSize) - .groupWithin(config.batchSize, config.timeWindows.second) - .evalTap( - _.traverse_(x => - info"Received $x without p0 or p1 fields".whenA(x.fullDocument.exists(_.shouldDebug)) - ) - ) - .map(_.toList.distincByDocId) - .evalTap(_.traverse_(x => x.fullDocument.traverse_(x => debug"${x.debug}"))) - private def saveLastIndexedTimestamp(time: Instant): IO[Unit] = store.put(index.value, time) *> info"Stored last indexed time ${time.getEpochSecond} for $index" private def startAt: IO[Option[Instant]] = config.startAt.fold(store.get(index.value))(_.some.pure[IO]) - - object F: - val createdAt = "ca" - val updatedAt = "ua" - -type PlayerId = String -case class DbGame( - id: String, // _id - players: List[PlayerId], // us - winnerId: Option[PlayerId], // wid - createdAt: Instant, // ca - movedAt: Instant, // ua - ply: Int, // t - analysed: Option[Boolean], // an - whitePlayer: Option[DbPlayer], // p0 - blackPlayer: Option[DbPlayer], // p1 - playerIds: String, // is - binaryPieces: Option[Array[Byte]], // ps - huffmanPgn: Option[Array[Byte]], // hp - status: Int, // s - encodedClock: Option[Array[Byte]], // c - moveTimes: Option[Array[Byte]], // mt - encodedWhiteClock: Option[Array[Byte]], // cw - encodedBlackClock: Option[Array[Byte]], // cb - rated: Option[Boolean], // ra - variant: Option[Int], // v - source: Option[Int], // so - winnerColor: Option[Boolean] // w -): - def clockConfig = encodedClock.flatMap(ClockDecoder.read).map(_.white) - def clockInit = clockConfig.map(_.limitSeconds.value) - def clockInc = clockConfig.map(_.incrementSeconds.value) - def whiteId = players.headOption - def blackId = players.lift(1) - def variantOrDefault = Variant.idOrDefault(variant.map(Variant.Id.apply)) - def speed = 
Speed(clockConfig) - def loser = players.find(_.some != winnerId) - def aiLevel = whitePlayer.flatMap(_.aiLevel).orElse(blackPlayer.flatMap(_.aiLevel)) - - // https://github.com/lichess-org/lila/blob/65e6dd88e99cfa0068bc790a4518a6edb3513f54/modules/core/src/main/game/Game.scala#L261 - private def averageUsersRating = - List(whitePlayer.flatMap(_.rating), blackPlayer.flatMap(_.rating)).flatten match - case a :: b :: Nil => Some((a + b) / 2) - case a :: Nil => Some((a + 1500) / 2) - case _ => None - - // https://github.com/lichess-org/lila/blob/02ac57c4584b89a0df8f343f34074c0135c2d2b4/modules/core/src/main/game/Game.scala#L90-L97 - def durationSeconds: Option[Int] = - val seconds = (movedAt.toEpochMilli / 1000 - createdAt.toEpochMilli / 1000) - Option.when(seconds < 60 * 60 * 12)(seconds.toInt) - - def toSource = - id -> - GameSource( - status = status, - turns = (ply + 1) / 2, - rated = rated.getOrElse(false), - perf = DbGame.perfId(variantOrDefault, speed), - winnerColor = winnerColor.fold(3)(if _ then 1 else 2), - date = SearchDateTime.fromInstant(movedAt), - analysed = analysed.getOrElse(false), - uids = players.some, // make usid not optional - winner = winnerId, - loser = loser, - averageRating = averageUsersRating, - ai = aiLevel, - duration = durationSeconds, - clockInit = clockInit, - clockInc = clockInc, - whiteUser = whiteId, - blackUser = blackId, - source = source - ) - - def shouldDebug = - whitePlayer.isEmpty || blackPlayer.isEmpty - - def debug = - import smithy4s.json.Json.given - import com.github.plokhotnyuk.jsoniter_scala.core.* - id -> writeToString(toSource._2) - -object DbGame: - // format: off - given Decoder[DbGame] = Decoder.forProduct21( - "_id", "us", "wid", "ca", "ua", "t", "an", "p0", "p1", "is", "ps", - "hp", "s", "c", "mt", "cw", "cb", "ra", "v", "so", "w")(DbGame.apply) - // format: on - - // We don't write to the database so We don't need to implement this - given Encoder[DbGame] = new Encoder[DbGame]: - def apply(a: DbGame): Json = 
??? - - def perfId(variant: Variant, speed: Speed): Int = - variant.match - case Standard | FromPosition => - speed match - case Speed.UltraBullet => 0 - case Speed.Bullet => 1 - case Speed.Blitz => 2 - case Speed.Rapid => 6 - case Speed.Classical => 3 - case Speed.Correspondence => 4 - case Crazyhouse => 18 - case Chess960 => 11 - case KingOfTheHill => 12 - case ThreeCheck => 15 - case Antichess => 13 - case Atomic => 14 - case Horde => 16 - case RacingKings => 17 - -case class DbPlayer( - rating: Option[Int], - ratingDiff: Option[Int], - berserk: Option[Boolean], - aiLevel: Option[Int], - provisional: Option[Boolean], - name: Option[String] -) - -object DbPlayer: - given Decoder[DbPlayer] = Decoder.forProduct6("e", "d", "be", "ai", "p", "na")(DbPlayer.apply) - given Encoder[DbPlayer] = Encoder.forProduct6("e", "d", "be", "ai", "p", "na")(p => - (p.rating, p.ratingDiff, p.berserk, p.aiLevel, p.provisional, p.name) - ) - -object ClockDecoder: - import chess.* - private def readClockLimit(i: Int) = Clock.LimitSeconds(if i < 181 then i * 60 else (i - 180) * 15) - - private inline def toInt(inline b: Byte): Int = b & 0xff - - def read(ba: Array[Byte]): Option[ByColor[Clock.Config]] = - ByColor: color => - ba.take(2).map(toInt) match - case Array(b1, b2) => Clock.Config(readClockLimit(b1), Clock.IncrementSeconds(b2)).some - case _ => None diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index 0aa13fc5..e35ad4ab 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -22,11 +22,12 @@ object Ingestor: ( Forums(lichess, config.forum), Studies(study, local, config.study), - TeamIngestor(lichess, elastic, store, config.team), - GameIngestor(lichess, elastic, store, config.game) - ).mapN: (forums, studies, team, game) => + Games(lichess, config.game), + TeamIngestor(lichess, elastic, store, config.team) + ).mapN: (forums, studies, games, team) => val forum 
= ForumIngestor(elastic, store, config.forum, forums) val study = StudyIngestor(studies, elastic, store, config.study) + val game = GameIngestor(games, elastic, store, config.game) new Ingestor: def run() = fs2 diff --git a/modules/ingestor/src/main/scala/mongo.game.scala b/modules/ingestor/src/main/scala/mongo.game.scala new file mode 100644 index 00000000..9e33313f --- /dev/null +++ b/modules/ingestor/src/main/scala/mongo.game.scala @@ -0,0 +1,253 @@ +package lila.search +package ingestor + +import cats.effect.* +import cats.syntax.all.* +import chess.Speed +import chess.variant.* +import com.mongodb.client.model.changestream.FullDocument +import com.mongodb.client.model.changestream.OperationType.* +import io.circe.* +import mongo4cats.circe.* +import mongo4cats.collection.MongoCollection +import mongo4cats.database.MongoDatabase +import mongo4cats.models.collection.ChangeStreamDocument +import mongo4cats.operations.{ Aggregate, Filter, Projection } +import org.bson.BsonTimestamp +import org.typelevel.log4cats.syntax.* +import org.typelevel.log4cats.{ Logger, LoggerFactory } + +import java.time.Instant +import scala.concurrent.duration.* + +import Games.Result + +trait Games: + def watch(since: Option[Instant]): fs2.Stream[IO, Result] + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] + +object Games: + + private type SourceWithId = (String, GameSource) + case class Result(toIndex: List[SourceWithId], toDelete: List[Id], timestamp: Option[Instant]) + + private val interestedOperations = List(UPDATE, DELETE).map(_.getValue) + private val eventFilter = Filter.in("operationType", interestedOperations) + + private val interestedEventFields = + List( + "operationType", + "clusterTime", + "documentKey._id", + "fullDocument" + ) // TODO only include interestedFields + + private val eventProjection = Projection.include(interestedEventFields) + + // 
https://github.com/lichess-org/lila/blob/65e6dd88e99cfa0068bc790a4518a6edb3513f54/modules/gameSearch/src/main/GameSearchApi.scala#L52 + val gameFilter: Filter = + // Filter games that finished + // https://github.com/lichess-org/scalachess/blob/18edf46a50445048fdc2ee5a83752e5b3884f490/core/src/main/scala/Status.scala#L18-L27 + val statusFilter = Filter.gte("s", 30) + val noImportFilter = Filter.ne("so", 7) + // us fields is the list of player ids, if it's missing then it's + // an all anonymous (or anonymous vs stockfish) game + val noAllAnonFilter = Filter.exists("us") + statusFilter.and(noImportFilter).and(noAllAnonFilter) + + // https://github.com/lichess-org/lila/blob/65e6dd88e99cfa0068bc790a4518a6edb3513f54/modules/gameSearch/src/main/GameSearchApi.scala#L52 + val changeFilter: Filter = + // Filter games that finished + // https://github.com/lichess-org/scalachess/blob/18edf46a50445048fdc2ee5a83752e5b3884f490/core/src/main/scala/Status.scala#L18-L27 + val statusFilter = Filter.gte("fullDocument.s", 30) + val noImportFilter = Filter.ne("fullDocument.so", 7) + // us fields is the list of player ids, if it's missing then it's + // an all anonymous (or anonymous vs stockfish) game + val noAllAnonFilter = Filter.exists("fullDocument.us") + statusFilter.and(noImportFilter).and(noAllAnonFilter) + + private val aggregate = + Aggregate.matchBy(eventFilter.and(changeFilter)).combinedWith(Aggregate.project(eventProjection)) + + def apply(mongo: MongoDatabase[IO], config: IngestorConfig.Game)(using LoggerFactory[IO]): IO[Games] = + given Logger[IO] = summon[LoggerFactory[IO]].getLogger + mongo.getCollectionWithCodec[DbGame]("game5").map(apply(config)) + + def apply(config: IngestorConfig.Game)(games: MongoCollection[IO, DbGame])(using Logger[IO]): Games = new: + + def watch(since: Option[Instant]): fs2.Stream[IO, Result] = + changes(since) + .map: events => + val lastEventTimestamp = events.lastOption.flatMap(_.clusterTime).flatMap(_.asInstant) + val (toDelete, toIndex) = 
events.partition(_.operationType == DELETE) + Result( + toIndex.flatten(_.fullDocument.map(_.toSource)), + toDelete.flatten(_.docId.map(Id.apply)), + lastEventTimestamp + ) + + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] = + val filter = range(F.createdAt)(since, until.some) + .or(range(F.updatedAt)(since, until.some)) + games + .find(filter.and(gameFilter)) + // .projection(postProjection) + .boundedStream(config.batchSize) + .chunkN(config.batchSize) + .map(_.toList) + .metered(1.second) // to avoid overloading the elasticsearch + .map(ds => Result(ds.map(_.toSource), Nil, none)) + + private def changes(since: Option[Instant]): fs2.Stream[IO, List[ChangeStreamDocument[DbGame]]] = + val builder = games.watch(aggregate) + since + .fold(builder)(x => builder.startAtOperationTime(x.asBsonTimestamp)) + .fullDocument(FullDocument.UPDATE_LOOKUP) // this is required for update event + .batchSize(config.batchSize) + .boundedStream(config.batchSize) + .groupWithin(config.batchSize, config.timeWindows.second) + .evalTap( + _.traverse_(x => + info"Received $x without p0 or p1 fields".whenA(x.fullDocument.exists(_.shouldDebug)) + ) + ) + .map(_.toList.distincByDocId) + .evalTap(_.traverse_(x => x.fullDocument.traverse_(x => debug"${x.debug}"))) + + object F: + val createdAt = "ca" + val updatedAt = "ua" + +type PlayerId = String +case class DbGame( + id: String, // _id + players: List[PlayerId], // us + winnerId: Option[PlayerId], // wid + createdAt: Instant, // ca + movedAt: Instant, // ua + ply: Int, // t + analysed: Option[Boolean], // an + whitePlayer: Option[DbPlayer], // p0 + blackPlayer: Option[DbPlayer], // p1 + playerIds: String, // is + binaryPieces: Option[Array[Byte]], // ps + huffmanPgn: Option[Array[Byte]], // hp + status: Int, // s + encodedClock: Option[Array[Byte]], // c + moveTimes: Option[Array[Byte]], // mt + encodedWhiteClock: Option[Array[Byte]], // cw + encodedBlackClock: Option[Array[Byte]], // cb + rated: Option[Boolean], // ra + 
variant: Option[Int], // v + source: Option[Int], // so + winnerColor: Option[Boolean] // w +): + def clockConfig = encodedClock.flatMap(ClockDecoder.read).map(_.white) + def clockInit = clockConfig.map(_.limitSeconds.value) + def clockInc = clockConfig.map(_.incrementSeconds.value) + def whiteId = players.headOption + def blackId = players.lift(1) + def variantOrDefault = Variant.idOrDefault(variant.map(Variant.Id.apply)) + def speed = Speed(clockConfig) + def loser = players.find(_.some != winnerId) + def aiLevel = whitePlayer.flatMap(_.aiLevel).orElse(blackPlayer.flatMap(_.aiLevel)) + + // https://github.com/lichess-org/lila/blob/65e6dd88e99cfa0068bc790a4518a6edb3513f54/modules/core/src/main/game/Game.scala#L261 + private def averageUsersRating = + List(whitePlayer.flatMap(_.rating), blackPlayer.flatMap(_.rating)).flatten match + case a :: b :: Nil => Some((a + b) / 2) + case a :: Nil => Some((a + 1500) / 2) + case _ => None + + // https://github.com/lichess-org/lila/blob/02ac57c4584b89a0df8f343f34074c0135c2d2b4/modules/core/src/main/game/Game.scala#L90-L97 + def durationSeconds: Option[Int] = + val seconds = (movedAt.toEpochMilli / 1000 - createdAt.toEpochMilli / 1000) + Option.when(seconds < 60 * 60 * 12)(seconds.toInt) + + def toSource = + id -> + GameSource( + status = status, + turns = (ply + 1) / 2, + rated = rated.getOrElse(false), + perf = DbGame.perfId(variantOrDefault, speed), + winnerColor = winnerColor.fold(3)(if _ then 1 else 2), + date = SearchDateTime.fromInstant(movedAt), + analysed = analysed.getOrElse(false), + uids = players.some, // make usid not optional + winner = winnerId, + loser = loser, + averageRating = averageUsersRating, + ai = aiLevel, + duration = durationSeconds, + clockInit = clockInit, + clockInc = clockInc, + whiteUser = whiteId, + blackUser = blackId, + source = source + ) + + def shouldDebug = + whitePlayer.isEmpty || blackPlayer.isEmpty + + def debug = + import smithy4s.json.Json.given + import 
com.github.plokhotnyuk.jsoniter_scala.core.* + id -> writeToString(toSource._2) + +object DbGame: + // format: off + given Decoder[DbGame] = Decoder.forProduct21( + "_id", "us", "wid", "ca", "ua", "t", "an", "p0", "p1", "is", "ps", + "hp", "s", "c", "mt", "cw", "cb", "ra", "v", "so", "w")(DbGame.apply) + // format: on + + // We don't write to the database so We don't need to implement this + given Encoder[DbGame] = new Encoder[DbGame]: + def apply(a: DbGame): Json = ??? + + def perfId(variant: Variant, speed: Speed): Int = + variant.match + case Standard | FromPosition => + speed match + case Speed.UltraBullet => 0 + case Speed.Bullet => 1 + case Speed.Blitz => 2 + case Speed.Rapid => 6 + case Speed.Classical => 3 + case Speed.Correspondence => 4 + case Crazyhouse => 18 + case Chess960 => 11 + case KingOfTheHill => 12 + case ThreeCheck => 15 + case Antichess => 13 + case Atomic => 14 + case Horde => 16 + case RacingKings => 17 + +case class DbPlayer( + rating: Option[Int], + ratingDiff: Option[Int], + berserk: Option[Boolean], + aiLevel: Option[Int], + provisional: Option[Boolean], + name: Option[String] +) + +object DbPlayer: + given Decoder[DbPlayer] = Decoder.forProduct6("e", "d", "be", "ai", "p", "na")(DbPlayer.apply) + given Encoder[DbPlayer] = Encoder.forProduct6("e", "d", "be", "ai", "p", "na")(p => + (p.rating, p.ratingDiff, p.berserk, p.aiLevel, p.provisional, p.name) + ) + +object ClockDecoder: + import chess.* + private def readClockLimit(i: Int) = Clock.LimitSeconds(if i < 181 then i * 60 else (i - 180) * 15) + + private inline def toInt(inline b: Byte): Int = b & 0xff + + def read(ba: Array[Byte]): Option[ByColor[Clock.Config]] = + ByColor: color => + ba.take(2).map(toInt) match + case Array(b1, b2) => Clock.Config(readClockLimit(b1), Clock.IncrementSeconds(b2)).some + case _ => None From b3dfd211f6034132e606fa7430c67176a0d4dfb3 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 16:24:36 +0100 Subject: [PATCH 04/18] Refactor team mongo --- 
modules/ingestor/src/main/scala/cli.scala | 5 +- .../src/main/scala/ingestor.forum.scala | 2 +- .../ingestor/src/main/scala/ingestor.scala | 7 +- .../src/main/scala/ingestor.team.scala | 114 +++-------------- .../ingestor/src/main/scala/mongo.team.scala | 117 ++++++++++++++++++ 5 files changed, 142 insertions(+), 103 deletions(-) create mode 100644 modules/ingestor/src/main/scala/mongo.team.scala diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index 4f490c93..257dba71 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -33,12 +33,13 @@ object cli config <- AppConfig.load.toResource res <- AppResources.instance(config) forums <- Forums(res.lichess, config.ingestor.forum).toResource - forum = ForumIngestor(res.elastic, res.store, config.ingestor.forum, forums) - team <- TeamIngestor(res.lichess, res.elastic, res.store, config.ingestor.team).toResource + forum = ForumIngestor(forums, res.elastic, res.store, config.ingestor.forum) studies <- Studies(res.study, res.studyLocal, config.ingestor.study).toResource study = StudyIngestor(studies, res.elastic, res.store, config.ingestor.study) games <- Games(res.lichess, config.ingestor.game).toResource game = GameIngestor(games, res.elastic, res.store, config.ingestor.game) + teams <- Teams(res.lichess, config.ingestor.team).toResource + team = TeamIngestor(teams, res.elastic, res.store, config.ingestor.team) yield Executor(forum, study, game, team) class Executor( diff --git a/modules/ingestor/src/main/scala/ingestor.forum.scala b/modules/ingestor/src/main/scala/ingestor.forum.scala index 9941c7f0..1090afa9 100644 --- a/modules/ingestor/src/main/scala/ingestor.forum.scala +++ b/modules/ingestor/src/main/scala/ingestor.forum.scala @@ -18,7 +18,7 @@ object ForumIngestor: private val index = Index.Forum - def apply(elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Forum, forums: Forums)(using + def apply(forums: 
Forums, elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Forum)(using LoggerFactory[IO] ): ForumIngestor = new: diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index e35ad4ab..529b60a6 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -23,11 +23,12 @@ object Ingestor: Forums(lichess, config.forum), Studies(study, local, config.study), Games(lichess, config.game), - TeamIngestor(lichess, elastic, store, config.team) - ).mapN: (forums, studies, games, team) => - val forum = ForumIngestor(elastic, store, config.forum, forums) + Teams(lichess, config.team) + ).mapN: (forums, studies, games, teams) => + val forum = ForumIngestor(forums, elastic, store, config.forum) val study = StudyIngestor(studies, elastic, store, config.study) val game = GameIngestor(games, elastic, store, config.game) + val team = TeamIngestor(teams, elastic, store, config.team) new Ingestor: def run() = fs2 diff --git a/modules/ingestor/src/main/scala/ingestor.team.scala b/modules/ingestor/src/main/scala/ingestor.team.scala index 2f8acc8f..4038836e 100644 --- a/modules/ingestor/src/main/scala/ingestor.team.scala +++ b/modules/ingestor/src/main/scala/ingestor.team.scala @@ -3,17 +3,10 @@ package ingestor import cats.effect.IO import cats.syntax.all.* -import com.mongodb.client.model.changestream.FullDocument -import com.mongodb.client.model.changestream.OperationType.* -import mongo4cats.bson.Document -import mongo4cats.database.MongoDatabase -import mongo4cats.models.collection.ChangeStreamDocument -import mongo4cats.operations.{ Aggregate, Filter, Projection } import org.typelevel.log4cats.syntax.* import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant -import scala.concurrent.duration.* trait TeamIngestor: // watch change events from MongoDB and ingest team data into elastic search @@ -25,66 +18,37 @@ object TeamIngestor: private val 
index = Index.Team - private val interestedOperations = List(DELETE, INSERT, UPDATE, REPLACE).map(_.getValue) - private val eventFilter = Filter.in("operationType", interestedOperations) - - private val interestedFields = List("_id", F.name, F.description, F.nbMembers, F.name, F.enabled) - private val postProjection = Projection.include(interestedFields) - - private val interestedEventFields = - List("operationType", "clusterTime", "documentKey._id") ++ interestedFields.map("fullDocument." + _) - private val eventProjection = Projection.include(interestedEventFields) - - private val aggregate = Aggregate.matchBy(eventFilter).combinedWith(Aggregate.project(eventProjection)) - - def apply(mongo: MongoDatabase[IO], elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Team)( - using LoggerFactory[IO] - ): IO[TeamIngestor] = - given Logger[IO] = summon[LoggerFactory[IO]].getLogger - mongo.getCollection("team").map(apply(elastic, store, config)) - - def apply(elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Team)(teams: MongoCollection)(using - Logger[IO] + def apply(teams: Teams, elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Team)(using + LoggerFactory[IO] ): TeamIngestor = new: + given Logger[IO] = summon[LoggerFactory[IO]].getLogger def watch = fs2.Stream .eval(startAt.flatTap(since => info"Starting team ingestor from $since")) .flatMap: last => - changeStream(last) - .filterNot(_.isEmpty) - .evalMap: events => - val lastEventTimestamp = events.lastOption.flatMap(_.clusterTime).flatMap(_.asInstant) - val (toDelete, toIndex) = events.partition(_.isDelete) - storeBulk(toIndex.flatten(_.fullDocument)) - *> elastic.deleteMany(index, toDelete) - *> saveLastIndexedTimestamp(lastEventTimestamp.getOrElse(Instant.now)) + teams + .watch(last) + .evalMap: result => + storeBulk(result.toIndex) + *> elastic.deleteMany(index, result.toDelete) + *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now)) def run(since: Instant, until: 
Instant, dryRun: Boolean) = - val filter = range(F.createdAt)(since, until.some) - .or(range(F.updatedAt)(since, until.some)) - .or(range(F.erasedAt)(since, until.some)) teams - .find(filter) - .projection(postProjection) - .boundedStream(config.batchSize) - .chunkN(config.batchSize) - .map(_.toList) - .metered(1.second) // to avoid overloading the elasticsearch - .evalMap: docs => - val (toDelete, toIndex) = docs.partition(!_.isEnabled) + .fetch(since, until) + .evalMap: result => dryRun.fold( - toIndex.traverse_(doc => debug"Would index $doc") - *> toDelete.traverse_(doc => debug"Would delete $doc"), - storeBulk(toIndex) *> elastic.deleteMany(index, toDelete) + result.toIndex.traverse_(doc => debug"Would index $doc") + *> result.toDelete.traverse_(doc => debug"Would delete $doc"), + storeBulk(result.toIndex) *> elastic.deleteMany(index, result.toDelete) ) - private def storeBulk(docs: List[Document]): IO[Unit] = - val sources = docs.toSources - info"Received ${docs.size} teams to index" *> + private def storeBulk(sources: List[Teams.SourceWithId]): IO[Unit] = + info"Received ${sources.size} teams to index" *> elastic .storeBulk(index, sources) .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to index teams: ${docs.map(_.id).mkString(", ")}") + Logger[IO].error(e)(s"Failed to index teams: ${sources.map(_._1).mkString(", ")}") .whenA(sources.nonEmpty) *> info"Indexed ${sources.size} teams" @@ -94,47 +58,3 @@ object TeamIngestor: private def startAt: IO[Option[Instant]] = config.startAt.fold(store.get(index.value))(_.some.pure[IO]) - - private def changeStream(since: Option[Instant]): fs2.Stream[IO, List[ChangeStreamDocument[Document]]] = - // skip the first event if we're starting from a specific timestamp - // since the event at that timestamp is already indexed - val skip = since.fold(0)(_ => 1) - val builder = teams.watch(aggregate) - since - .fold(builder)(x => builder.startAtOperationTime(x.asBsonTimestamp)) - .batchSize(config.batchSize) - 
.fullDocument(FullDocument.UPDATE_LOOKUP) // this is required for update event - .boundedStream(config.batchSize) - .drop(skip) - .evalTap(x => debug"Team change stream event: $x") - .groupWithin(config.batchSize, config.timeWindows.second) - .map(_.toList.distincByDocId) - - extension (docs: List[Document]) - private def toSources: List[(String, TeamSource)] = - docs.flatten(doc => (doc.id, doc.toSource).mapN(_ -> _)) - - extension (doc: Document) - private def toSource: Option[TeamSource] = - ( - doc.getString(F.name), - doc.getString(F.description), - doc.getInt(F.nbMembers) - ).mapN(TeamSource.apply) - - private def isEnabled = - doc.getBoolean(F.enabled).getOrElse(true) - - extension (event: ChangeStreamDocument[Document]) - private def isDelete: Boolean = - event.operationType == DELETE || - event.fullDocument.fold(false)(x => !x.isEnabled) - - object F: - val name = "name" - val description = "description" - val nbMembers = "nbMembers" - val enabled = "enabled" - val createdAt = "createdAt" - val updatedAt = "updatedAt" - val erasedAt = "erasedAt" diff --git a/modules/ingestor/src/main/scala/mongo.team.scala b/modules/ingestor/src/main/scala/mongo.team.scala new file mode 100644 index 00000000..fc8d59cb --- /dev/null +++ b/modules/ingestor/src/main/scala/mongo.team.scala @@ -0,0 +1,117 @@ +package lila.search +package ingestor + +import cats.effect.IO +import cats.syntax.all.* +import com.mongodb.client.model.changestream.FullDocument +import com.mongodb.client.model.changestream.OperationType.* +import mongo4cats.bson.Document +import mongo4cats.database.MongoDatabase +import mongo4cats.models.collection.ChangeStreamDocument +import mongo4cats.operations.{ Aggregate, Filter, Projection } +import org.typelevel.log4cats.syntax.* +import org.typelevel.log4cats.{ Logger, LoggerFactory } + +import java.time.Instant +import scala.concurrent.duration.* + +import Teams.Result +trait Teams: + def watch(since: Option[Instant]): fs2.Stream[IO, Result] + def 
fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] + +object Teams: + + type SourceWithId = (String, TeamSource) + case class Result(toIndex: List[SourceWithId], toDelete: List[Id], timestamp: Option[Instant]) + + private val interestedOperations = List(DELETE, INSERT, UPDATE, REPLACE).map(_.getValue) + private val eventFilter = Filter.in("operationType", interestedOperations) + + private val interestedFields = List("_id", F.name, F.description, F.nbMembers, F.name, F.enabled) + private val postProjection = Projection.include(interestedFields) + + private val interestedEventFields = + List("operationType", "clusterTime", "documentKey._id") ++ interestedFields.map("fullDocument." + _) + private val eventProjection = Projection.include(interestedEventFields) + + private val aggregate = Aggregate.matchBy(eventFilter).combinedWith(Aggregate.project(eventProjection)) + + def apply(mongo: MongoDatabase[IO], config: IngestorConfig.Team)(using + LoggerFactory[IO] + ): IO[Teams] = + given Logger[IO] = summon[LoggerFactory[IO]].getLogger + mongo.getCollection("team").map(apply(config)) + + def apply(config: IngestorConfig.Team)(teams: MongoCollection)(using Logger[IO]): Teams = new: + + def watch(since: Option[Instant]) = + // skip the first event if we're starting from a specific timestamp + // since the event at that timestamp is already indexed + val skip = since.fold(0)(_ => 1) + val builder = teams.watch(aggregate) + since + .fold(builder)(x => builder.startAtOperationTime(x.asBsonTimestamp)) + .batchSize(config.batchSize) + .fullDocument(FullDocument.UPDATE_LOOKUP) // this is required for update event + .boundedStream(config.batchSize) + .drop(skip) + .evalTap(x => debug"Team change stream event: $x") + .groupWithin(config.batchSize, config.timeWindows.second) + .map(_.toList.distincByDocId) + .map: docs => + val lastEventTimestamp = docs.lastOption.flatMap(_.clusterTime).flatMap(_.asInstant) + val (toDelete, toIndex) = docs.partition(_.isDelete) + Result( + 
toIndex.flatten(_.fullDocument).toSources, + toDelete.flatten(_.docId.map(Id.apply)), + lastEventTimestamp + ) + + def fetch(since: Instant, until: Instant) = + val filter = range(F.createdAt)(since, until.some) + .or(range(F.updatedAt)(since, until.some)) + .or(range(F.erasedAt)(since, until.some)) + teams + .find(filter) + .projection(postProjection) + .boundedStream(config.batchSize) + .chunkN(config.batchSize) + .map(_.toList) + .metered(1.second) // to avoid overloading the elasticsearch + .map: docs => + val (toDelete, toIndex) = docs.partition(!_.isEnabled) + Result( + toIndex.toSources, + toDelete.flatten(_.id.map(Id.apply)), + none + ) + + extension (docs: List[Document]) + private def toSources: List[(String, TeamSource)] = + docs.flatten(doc => (doc.id, doc.toSource).mapN(_ -> _)) + + extension (doc: Document) + private def toSource: Option[TeamSource] = + ( + doc.getString(F.name), + doc.getString(F.description), + doc.getInt(F.nbMembers) + ).mapN(TeamSource.apply) + + private def isEnabled = + doc.getBoolean(F.enabled).getOrElse(true) + + extension (event: ChangeStreamDocument[Document]) + private def isDelete: Boolean = + event.operationType == DELETE || + event.fullDocument.fold(false)(x => !x.isEnabled) + + object F: + val name = "name" + val description = "description" + val nbMembers = "nbMembers" + val enabled = "enabled" + val createdAt = "createdAt" + val updatedAt = "updatedAt" + val erasedAt = "erasedAt" From e4f85d7a1bddd4b0fdccd9e420dd4ced607ced0b Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 16:37:38 +0100 Subject: [PATCH 05/18] Make ingestor return IO --- modules/ingestor/src/main/scala/cli.scala | 18 ++++++------ .../src/main/scala/ingestor.forum.scala | 12 +++++--- .../src/main/scala/ingestor.game.scala | 28 +++++++++++-------- .../ingestor/src/main/scala/ingestor.scala | 9 ++---- .../src/main/scala/ingestor.study.scala | 12 +++++--- .../src/main/scala/ingestor.team.scala | 8 ++++-- 6 files changed, 50 insertions(+), 37 
deletions(-) diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index 257dba71..492d6179 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -56,23 +56,23 @@ object cli def index(opts: IndexOpts): IO[Unit] = opts.index match case Index.Forum => - forum.run(opts.since, opts.until, opts.dry).compile.drain + forum.run(opts.since, opts.until, opts.dry) case Index.Study => - study.run(opts.since, opts.until, opts.dry).compile.drain + study.run(opts.since, opts.until, opts.dry) case Index.Game => - game.run(opts.since, opts.until, opts.dry).compile.drain + game.run(opts.since, opts.until, opts.dry) case Index.Team => - team.run(opts.since, opts.until, opts.dry).compile.drain + team.run(opts.since, opts.until, opts.dry) case _ => - forum.run(opts.since, opts.until, opts.dry).compile.drain *> - study.run(opts.since, opts.until, opts.dry).compile.drain *> - game.run(opts.since, opts.until, opts.dry).compile.drain *> - team.run(opts.since, opts.until, opts.dry).compile.drain + forum.run(opts.since, opts.until, opts.dry) *> + study.run(opts.since, opts.until, opts.dry) *> + game.run(opts.since, opts.until, opts.dry) *> + team.run(opts.since, opts.until, opts.dry) def watch(opts: WatchOpts): IO[Unit] = opts.index match case Index.Game => - game.watch(opts.since.some, opts.dry).compile.drain + game.watch(opts.since.some, opts.dry) case _ => IO.println("We only support game watch for now") object opts: diff --git a/modules/ingestor/src/main/scala/ingestor.forum.scala b/modules/ingestor/src/main/scala/ingestor.forum.scala index 1090afa9..db2fa7fd 100644 --- a/modules/ingestor/src/main/scala/ingestor.forum.scala +++ b/modules/ingestor/src/main/scala/ingestor.forum.scala @@ -10,9 +10,9 @@ import java.time.Instant trait ForumIngestor: // watch change events from MongoDB and ingest forum posts into elastic search - def watch: fs2.Stream[IO, Unit] + def watch: IO[Unit] // Fetch posts 
in [since, until] and ingest into elastic search - def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] object ForumIngestor: @@ -24,7 +24,7 @@ object ForumIngestor: given Logger[IO] = LoggerFactory[IO].getLogger - def watch: fs2.Stream[IO, Unit] = + def watch: IO[Unit] = fs2.Stream .eval(startAt.flatTap(since => info"Starting forum ingestor from $since")) .flatMap: last => @@ -34,8 +34,10 @@ object ForumIngestor: storeBulk(result.toIndex) *> elastic.deleteMany(index, result.toDelete) *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now())) + .compile + .drain - def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] = + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] = forums .fetch(since, until) .evalMap: result => @@ -44,6 +46,8 @@ object ForumIngestor: *> result.toDelete.traverse_(doc => debug"Would delete $doc"), storeBulk(result.toIndex) *> elastic.deleteMany(index, result.toDelete) ) + .compile + .drain private def storeBulk(docs: List[(String, ForumSource)]): IO[Unit] = info"Received ${docs.size} forum posts to index" *> diff --git a/modules/ingestor/src/main/scala/ingestor.game.scala b/modules/ingestor/src/main/scala/ingestor.game.scala index 8df722d5..8037a4d9 100644 --- a/modules/ingestor/src/main/scala/ingestor.game.scala +++ b/modules/ingestor/src/main/scala/ingestor.game.scala @@ -10,11 +10,11 @@ import java.time.Instant trait GameIngestor: // watch change events from game5 collection and ingest games into elastic search - def watch: fs2.Stream[IO, Unit] + def watch: IO[Unit] // Similar to watch but started from a given timestamp - def watch(since: Option[Instant], dryRun: Boolean): fs2.Stream[IO, Unit] + def watch(since: Option[Instant], dryRun: Boolean): IO[Unit] // Fetch posts in [since, until] and ingest into elastic search - def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, 
Unit] + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] object GameIngestor: @@ -25,18 +25,29 @@ object GameIngestor: ): GameIngestor = new: given Logger[IO] = LoggerFactory[IO].getLogger - def watch: fs2.Stream[IO, Unit] = + def watch: IO[Unit] = fs2.Stream .eval(startAt.flatTap(since => info"Starting game ingestor from $since")) .flatMap(games.watch(_)) .evalMap(updateElastic(_, false)) + .compile + .drain - def watch(since: Option[Instant], dryRun: Boolean): fs2.Stream[IO, Unit] = + def watch(since: Option[Instant], dryRun: Boolean): IO[Unit] = games .watch(since) .evalMap(updateElastic(_, dryRun)) + .compile + .drain - def updateElastic(result: Games.Result, dryRun: Boolean): IO[Unit] = + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] = + games + .fetch(since, until) + .evalMap(updateElastic(_, dryRun)) + .compile + .drain + + private def updateElastic(result: Games.Result, dryRun: Boolean): IO[Unit] = dryRun.fold( info"Would index total ${result.toIndex.size} games and delete ${result.toDelete.size} games" *> result.toIndex.traverse_(x => debug"Would index $x") @@ -46,11 +57,6 @@ object GameIngestor: *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now)) ) - def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] = - games - .fetch(since, until) - .evalMap(updateElastic(_, dryRun)) - private def storeBulk(sources: List[(String, GameSource)]): IO[Unit] = info"Received ${sources.size} ${index.value}s to index" *> elastic diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index 529b60a6..f0338c72 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -30,10 +30,5 @@ object Ingestor: val game = GameIngestor(games, elastic, store, config.game) val team = TeamIngestor(teams, elastic, store, config.team) new Ingestor: - def run() = - fs2 - .Stream(forum.watch, team.watch, 
study.watch, game.watch) - .covary[IO] - .parJoinUnbounded - .compile - .drain + def run(): IO[Unit] = + List(forum.watch, team.watch, study.watch, game.watch).parSequence_ diff --git a/modules/ingestor/src/main/scala/ingestor.study.scala b/modules/ingestor/src/main/scala/ingestor.study.scala index 52939400..4678be4d 100644 --- a/modules/ingestor/src/main/scala/ingestor.study.scala +++ b/modules/ingestor/src/main/scala/ingestor.study.scala @@ -10,8 +10,8 @@ import java.time.Instant trait StudyIngestor: // pull changes from study MongoDB and ingest into elastic search - def watch: fs2.Stream[IO, Unit] - def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] + def watch: IO[Unit] + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] object StudyIngestor: @@ -24,7 +24,7 @@ object StudyIngestor: config: IngestorConfig.Study )(using LoggerFactory[IO]): StudyIngestor = new: given Logger[IO] = LoggerFactory[IO].getLogger - def watch: fs2.Stream[IO, Unit] = + def watch: IO[Unit] = fs2.Stream .eval( config.startAt.fold(store.get(index.value))(_.some.pure[IO]) @@ -35,8 +35,10 @@ object StudyIngestor: .evalMap: result => storeBulk(result.toIndex, false) *> elastic.deleteMany(index, result.toDelete) *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now())) + .compile + .drain - def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] = + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] = studies .fetch(since, until) .evalMap: result => @@ -45,6 +47,8 @@ object StudyIngestor: *> result.toDelete.traverse_(doc => debug"Would delete $doc"), storeBulk(result.toIndex, dryRun) *> elastic.deleteMany(index, result.toDelete) ) + .compile + .drain def storeBulk(sources: List[(String, StudySource)], dryRun: Boolean = false): IO[Unit] = info"Received ${sources.size} studies to index" *> diff --git a/modules/ingestor/src/main/scala/ingestor.team.scala 
b/modules/ingestor/src/main/scala/ingestor.team.scala index 4038836e..72e7132c 100644 --- a/modules/ingestor/src/main/scala/ingestor.team.scala +++ b/modules/ingestor/src/main/scala/ingestor.team.scala @@ -10,9 +10,9 @@ import java.time.Instant trait TeamIngestor: // watch change events from MongoDB and ingest team data into elastic search - def watch: fs2.Stream[IO, Unit] + def watch: IO[Unit] // Fetch teams in [since, until] and ingest into elastic search - def run(since: Instant, until: Instant, dryRun: Boolean): fs2.Stream[IO, Unit] + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] object TeamIngestor: @@ -32,6 +32,8 @@ object TeamIngestor: storeBulk(result.toIndex) *> elastic.deleteMany(index, result.toDelete) *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now)) + .compile + .drain def run(since: Instant, until: Instant, dryRun: Boolean) = teams @@ -42,6 +44,8 @@ object TeamIngestor: *> result.toDelete.traverse_(doc => debug"Would delete $doc"), storeBulk(result.toIndex) *> elastic.deleteMany(index, result.toDelete) ) + .compile + .drain private def storeBulk(sources: List[Teams.SourceWithId]): IO[Unit] = info"Received ${sources.size} teams to index" *> From a8b1302990767b2740c9d875e24bc2cb2c581671 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 16:40:41 +0100 Subject: [PATCH 06/18] Get logger from logger factory --- modules/app/src/main/scala/app.scala | 4 ++-- modules/ingestor/src/main/scala/app.scala | 4 ++-- modules/ingestor/src/main/scala/cli.scala | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/app/src/main/scala/app.scala b/modules/app/src/main/scala/app.scala index ce81afc8..6c5e4b63 100644 --- a/modules/app/src/main/scala/app.scala +++ b/modules/app/src/main/scala/app.scala @@ -3,7 +3,7 @@ package app import cats.effect.* import cats.syntax.all.* -import org.typelevel.log4cats.slf4j.{ Slf4jFactory, Slf4jLogger } +import org.typelevel.log4cats.slf4j.Slf4jFactory import 
org.typelevel.log4cats.{ Logger, LoggerFactory } import org.typelevel.otel4s.experimental.metrics.* import org.typelevel.otel4s.metrics.Meter @@ -14,8 +14,8 @@ import org.typelevel.otel4s.sdk.metrics.exporter.MetricExporter object App extends IOApp.Simple: - given Logger[IO] = Slf4jLogger.getLogger[IO] given LoggerFactory[IO] = Slf4jFactory.create[IO] + given Logger[IO] = LoggerFactory[IO].getLogger override def run: IO[Unit] = app.useForever diff --git a/modules/ingestor/src/main/scala/app.scala b/modules/ingestor/src/main/scala/app.scala index acf52cd4..5ec234db 100644 --- a/modules/ingestor/src/main/scala/app.scala +++ b/modules/ingestor/src/main/scala/app.scala @@ -2,7 +2,7 @@ package lila.search package ingestor import cats.effect.* -import org.typelevel.log4cats.slf4j.{ Slf4jFactory, Slf4jLogger } +import org.typelevel.log4cats.slf4j.Slf4jFactory import org.typelevel.log4cats.{ Logger, LoggerFactory } import org.typelevel.otel4s.experimental.metrics.* import org.typelevel.otel4s.metrics.Meter @@ -11,8 +11,8 @@ import org.typelevel.otel4s.sdk.metrics.SdkMetrics object App extends IOApp.Simple: - given Logger[IO] = Slf4jLogger.getLogger[IO] given LoggerFactory[IO] = Slf4jFactory.create[IO] + given Logger[IO] = LoggerFactory[IO].getLogger override def run: IO[Unit] = app.useForever diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index 492d6179..c43067f2 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -7,7 +7,7 @@ import cats.syntax.all.* import com.monovore.decline.* import com.monovore.decline.effect.* import lila.search.ingestor.opts.{ IndexOpts, WatchOpts } -import org.typelevel.log4cats.slf4j.{ Slf4jFactory, Slf4jLogger } +import org.typelevel.log4cats.slf4j.{ Slf4jFactory } import org.typelevel.log4cats.{ Logger, LoggerFactory } import org.typelevel.otel4s.metrics.Meter @@ -20,8 +20,8 @@ object cli version = "3.0.0" ): - given Logger[IO] = 
Slf4jLogger.getLogger[IO] given LoggerFactory[IO] = Slf4jFactory.create[IO] + given Logger[IO] = LoggerFactory[IO].getLogger given Meter[IO] = Meter.noop[IO] override def main: Opts[IO[ExitCode]] = From a2d6953d76879ba6676ae9920dceea2917613ebe Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 17:00:02 +0100 Subject: [PATCH 07/18] Simplify elastic operations --- modules/ingestor/src/main/scala/cli.scala | 11 +++--- .../src/main/scala/ingestor.forum.scala | 18 ++++------ .../src/main/scala/ingestor.game.scala | 20 ++++------- .../ingestor/src/main/scala/ingestor.scala | 9 ++--- .../src/main/scala/ingestor.study.scala | 17 ++-------- .../src/main/scala/ingestor.team.scala | 20 ++++------- modules/ingestor/src/main/scala/package.scala | 34 +++++++++---------- 7 files changed, 48 insertions(+), 81 deletions(-) diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index c43067f2..13d7713f 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -7,7 +7,7 @@ import cats.syntax.all.* import com.monovore.decline.* import com.monovore.decline.effect.* import lila.search.ingestor.opts.{ IndexOpts, WatchOpts } -import org.typelevel.log4cats.slf4j.{ Slf4jFactory } +import org.typelevel.log4cats.slf4j.Slf4jFactory import org.typelevel.log4cats.{ Logger, LoggerFactory } import org.typelevel.otel4s.metrics.Meter @@ -32,14 +32,15 @@ object cli for config <- AppConfig.load.toResource res <- AppResources.instance(config) + given ESClient[IO] = res.elastic forums <- Forums(res.lichess, config.ingestor.forum).toResource - forum = ForumIngestor(forums, res.elastic, res.store, config.ingestor.forum) + forum = ForumIngestor(forums, res.store, config.ingestor.forum) studies <- Studies(res.study, res.studyLocal, config.ingestor.study).toResource - study = StudyIngestor(studies, res.elastic, res.store, config.ingestor.study) + study = StudyIngestor(studies, res.store, 
config.ingestor.study) games <- Games(res.lichess, config.ingestor.game).toResource - game = GameIngestor(games, res.elastic, res.store, config.ingestor.game) + game = GameIngestor(games, res.store, config.ingestor.game) teams <- Teams(res.lichess, config.ingestor.team).toResource - team = TeamIngestor(teams, res.elastic, res.store, config.ingestor.team) + team = TeamIngestor(teams, res.store, config.ingestor.team) yield Executor(forum, study, game, team) class Executor( diff --git a/modules/ingestor/src/main/scala/ingestor.forum.scala b/modules/ingestor/src/main/scala/ingestor.forum.scala index db2fa7fd..d01d922d 100644 --- a/modules/ingestor/src/main/scala/ingestor.forum.scala +++ b/modules/ingestor/src/main/scala/ingestor.forum.scala @@ -18,8 +18,9 @@ object ForumIngestor: private val index = Index.Forum - def apply(forums: Forums, elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Forum)(using - LoggerFactory[IO] + def apply(forums: Forums, store: KVStore, config: IngestorConfig.Forum)(using + LoggerFactory[IO], + ESClient[IO] ): ForumIngestor = new: given Logger[IO] = LoggerFactory[IO].getLogger @@ -31,8 +32,8 @@ object ForumIngestor: forums .watch(last) .evalMap: result => - storeBulk(result.toIndex) - *> elastic.deleteMany(index, result.toDelete) + storeBulk(index, result.toIndex) + *> deleteMany(index, result.toDelete) *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now())) .compile .drain @@ -44,18 +45,11 @@ object ForumIngestor: dryRun.fold( result.toIndex.traverse_(doc => debug"Would index $doc") *> result.toDelete.traverse_(doc => debug"Would delete $doc"), - storeBulk(result.toIndex) *> elastic.deleteMany(index, result.toDelete) + storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) ) .compile .drain - private def storeBulk(docs: List[(String, ForumSource)]): IO[Unit] = - info"Received ${docs.size} forum posts to index" *> - elastic.storeBulk(index, docs) *> info"Indexed ${docs.size} forum posts" - 
.handleErrorWith: e => - Logger[IO].error(e)(s"Failed to index forum posts: ${docs.map(_._1).mkString(", ")}") - .whenA(docs.nonEmpty) - private def saveLastIndexedTimestamp(time: Instant): IO[Unit] = store.put(index.value, time) *> info"Stored last indexed time ${time.getEpochSecond} for $index" diff --git a/modules/ingestor/src/main/scala/ingestor.game.scala b/modules/ingestor/src/main/scala/ingestor.game.scala index 8037a4d9..b61df57c 100644 --- a/modules/ingestor/src/main/scala/ingestor.game.scala +++ b/modules/ingestor/src/main/scala/ingestor.game.scala @@ -20,15 +20,16 @@ object GameIngestor: private val index = Index.Game - def apply(games: Games, elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Game)(using - LoggerFactory[IO] + def apply(games: Games, store: KVStore, config: IngestorConfig.Game)(using + LoggerFactory[IO], + ESClient[IO] ): GameIngestor = new: given Logger[IO] = LoggerFactory[IO].getLogger def watch: IO[Unit] = fs2.Stream .eval(startAt.flatTap(since => info"Starting game ingestor from $since")) - .flatMap(games.watch(_)) + .flatMap(games.watch) .evalMap(updateElastic(_, false)) .compile .drain @@ -52,20 +53,11 @@ object GameIngestor: info"Would index total ${result.toIndex.size} games and delete ${result.toDelete.size} games" *> result.toIndex.traverse_(x => debug"Would index $x") *> result.toDelete.traverse_(x => debug"Would delete $x"), - storeBulk(result.toIndex) - *> elastic.deleteMany(index, result.toDelete) + storeBulk(index, result.toIndex) + *> deleteMany(index, result.toDelete) *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now)) ) - private def storeBulk(sources: List[(String, GameSource)]): IO[Unit] = - info"Received ${sources.size} ${index.value}s to index" *> - elastic - .storeBulk(index, sources) - .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to index ${index.value}s: ${sources.map(_._1).mkString(", ")}") - .whenA(sources.nonEmpty) - *> info"Indexed ${sources.size} ${index.value}s" - 
private def saveLastIndexedTimestamp(time: Instant): IO[Unit] = store.put(index.value, time) *> info"Stored last indexed time ${time.getEpochSecond} for $index" diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index f0338c72..658f72f2 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -25,10 +25,11 @@ object Ingestor: Games(lichess, config.game), Teams(lichess, config.team) ).mapN: (forums, studies, games, teams) => - val forum = ForumIngestor(forums, elastic, store, config.forum) - val study = StudyIngestor(studies, elastic, store, config.study) - val game = GameIngestor(games, elastic, store, config.game) - val team = TeamIngestor(teams, elastic, store, config.team) + given ESClient[IO] = elastic + val forum = ForumIngestor(forums, store, config.forum) + val study = StudyIngestor(studies, store, config.study) + val game = GameIngestor(games, store, config.game) + val team = TeamIngestor(teams, store, config.team) new Ingestor: def run(): IO[Unit] = List(forum.watch, team.watch, study.watch, game.watch).parSequence_ diff --git a/modules/ingestor/src/main/scala/ingestor.study.scala b/modules/ingestor/src/main/scala/ingestor.study.scala index 4678be4d..693a554b 100644 --- a/modules/ingestor/src/main/scala/ingestor.study.scala +++ b/modules/ingestor/src/main/scala/ingestor.study.scala @@ -19,10 +19,9 @@ object StudyIngestor: def apply( studies: Studies, - elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Study - )(using LoggerFactory[IO]): StudyIngestor = new: + )(using LoggerFactory[IO], ESClient[IO]): StudyIngestor = new: given Logger[IO] = LoggerFactory[IO].getLogger def watch: IO[Unit] = fs2.Stream @@ -33,7 +32,7 @@ object StudyIngestor: studies .watch(since) .evalMap: result => - storeBulk(result.toIndex, false) *> elastic.deleteMany(index, result.toDelete) + storeBulk(index, result.toIndex) *> deleteMany(index, 
result.toDelete) *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now())) .compile .drain @@ -45,21 +44,11 @@ object StudyIngestor: dryRun.fold( result.toIndex.traverse_(doc => debug"Would index $doc") *> result.toDelete.traverse_(doc => debug"Would delete $doc"), - storeBulk(result.toIndex, dryRun) *> elastic.deleteMany(index, result.toDelete) + storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) ) .compile .drain - def storeBulk(sources: List[(String, StudySource)], dryRun: Boolean = false): IO[Unit] = - info"Received ${sources.size} studies to index" *> - dryRun.fold( - sources.traverse_(source => debug"Would index $source"), - elastic.storeBulk(index, sources) *> info"Indexed ${sources.size} studies" - .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to index studies: ${sources.map(_._1).mkString(", ")}") - .whenA(sources.nonEmpty) - ) - def saveLastIndexedTimestamp(time: Instant): IO[Unit] = store.put(index.value, time) *> info"Stored last indexed time ${time.getEpochSecond} for $index" diff --git a/modules/ingestor/src/main/scala/ingestor.team.scala b/modules/ingestor/src/main/scala/ingestor.team.scala index 72e7132c..e7a212bb 100644 --- a/modules/ingestor/src/main/scala/ingestor.team.scala +++ b/modules/ingestor/src/main/scala/ingestor.team.scala @@ -18,8 +18,9 @@ object TeamIngestor: private val index = Index.Team - def apply(teams: Teams, elastic: ESClient[IO], store: KVStore, config: IngestorConfig.Team)(using - LoggerFactory[IO] + def apply(teams: Teams, store: KVStore, config: IngestorConfig.Team)(using + LoggerFactory[IO], + ESClient[IO] ): TeamIngestor = new: given Logger[IO] = summon[LoggerFactory[IO]].getLogger def watch = @@ -29,8 +30,8 @@ object TeamIngestor: teams .watch(last) .evalMap: result => - storeBulk(result.toIndex) - *> elastic.deleteMany(index, result.toDelete) + storeBulk(index, result.toIndex) + *> deleteMany(index, result.toDelete) *> 
saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now)) .compile .drain @@ -42,20 +43,11 @@ object TeamIngestor: dryRun.fold( result.toIndex.traverse_(doc => debug"Would index $doc") *> result.toDelete.traverse_(doc => debug"Would delete $doc"), - storeBulk(result.toIndex) *> elastic.deleteMany(index, result.toDelete) + storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) ) .compile .drain - private def storeBulk(sources: List[Teams.SourceWithId]): IO[Unit] = - info"Received ${sources.size} teams to index" *> - elastic - .storeBulk(index, sources) - .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to index teams: ${sources.map(_._1).mkString(", ")}") - .whenA(sources.nonEmpty) - *> info"Indexed ${sources.size} teams" - private def saveLastIndexedTimestamp(time: Instant): IO[Unit] = store.put(index.value, time) *> info"Stored last indexed time ${time.getEpochSecond} for $index" diff --git a/modules/ingestor/src/main/scala/package.scala b/modules/ingestor/src/main/scala/package.scala index ba2b4287..f4b6e5e2 100644 --- a/modules/ingestor/src/main/scala/package.scala +++ b/modules/ingestor/src/main/scala/package.scala @@ -11,7 +11,6 @@ import mongo4cats.models.collection.ChangeStreamDocument import mongo4cats.operations.Filter import org.bson.BsonTimestamp import org.typelevel.log4cats.Logger -import org.typelevel.log4cats.syntax.* import smithy4s.json.Json.given import smithy4s.schema.Schema @@ -39,24 +38,23 @@ def range(field: String)(since: Instant, until: Option[Instant]): Filter = inline def gtes = Filter.gte(field, since) until.fold(gtes)(until => gtes.and(Filter.lt(field, until))) -extension (elastic: ESClient[IO]) - - def deleteMany_(index: Index, ids: List[Id])(using Logger[IO]): IO[Unit] = +def deleteMany(index: Index, ids: List[Id])(using Logger[IO])(using elastic: ESClient[IO]): IO[Unit] = + elastic + .deleteMany(index, ids) + .flatTap(_ => Logger[IO].info(s"Deleted ${ids.size} ${index.value}s")) + .handleErrorWith: e => + 
Logger[IO].error(e)(s"Failed to delete ${index.value}: ${ids.map(_.value).mkString(", ")}") + .whenA(ids.nonEmpty) + +def storeBulk[A](index: Index, sources: List[(String, A)])(using Schema[A], Logger[IO])(using + elastic: ESClient[IO] +): IO[Unit] = + Logger[IO].info(s"Received ${sources.size} docs to ${index.value}") *> elastic - .deleteMany(index, ids) - .flatTap(_ => Logger[IO].info(s"Deleted ${ids.size} ${index.value}s")) + .storeBulk(index, sources) .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to delete ${index.value}: ${ids.map(_.value).mkString(", ")}") - .whenA(ids.nonEmpty) - - @scala.annotation.targetName("deleteManyWithDocs") - def deleteMany(index: Index, events: List[Document])(using Logger[IO]): IO[Unit] = - info"Received ${events.size} ${index.value} to delete" *> - deleteMany_(index, events.flatMap(_.id).map(Id.apply)).whenA(events.nonEmpty) - - @scala.annotation.targetName("deleteManyWithChanges") - def deleteMany[A](index: Index, events: List[ChangeStreamDocument[A]])(using Logger[IO]): IO[Unit] = - info"Received ${events.size} ${index.value} to delete" *> - deleteMany_(index, events.flatMap(_.docId).map(Id.apply)).whenA(events.nonEmpty) + Logger[IO].error(e)(s"Failed to ${index.value} index: ${sources.map(_._1).mkString(", ")}") + .whenA(sources.nonEmpty) + *> Logger[IO].info(s"Indexed ${sources.size} ${index.value}s") extension (s: String) def dollarPrefix = "$" + s From bf29742afdbddf8fcff061b78416f445d311dd61 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 17:51:00 +0100 Subject: [PATCH 08/18] Use Ingestor for cli as well --- modules/ingestor/src/main/scala/app.scala | 3 +- modules/ingestor/src/main/scala/cli.scala | 42 +++++++------------ .../ingestor/src/main/scala/ingestor.scala | 27 ++++++------ 3 files changed, 32 insertions(+), 40 deletions(-) diff --git a/modules/ingestor/src/main/scala/app.scala b/modules/ingestor/src/main/scala/app.scala index 5ec234db..3a84ae20 100644 --- 
a/modules/ingestor/src/main/scala/app.scala +++ b/modules/ingestor/src/main/scala/app.scala @@ -33,7 +33,8 @@ object App extends IOApp.Simple: class IngestorApp(res: AppResources, config: AppConfig)(using Logger[IO], LoggerFactory[IO]): def run(): Resource[IO, Unit] = - Ingestor(res.lichess, res.study, res.studyLocal, res.elastic, res.store, config.ingestor) + given ESClient[IO] = res.elastic + Ingestor(res.lichess, res.study, res.studyLocal, res.store, config.ingestor) .flatMap(_.run()) .toResource .evalTap(_ => Logger[IO].info("Ingestor started")) diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index 13d7713f..a0fab8c3 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -26,29 +26,17 @@ object cli override def main: Opts[IO[ExitCode]] = opts.parse.map: opts => - makeExecutor.use(_.execute(opts).as(ExitCode.Success)) + makeIngestor.use(_.execute(opts).as(ExitCode.Success)) - def makeExecutor: Resource[IO, Executor] = + def makeIngestor: Resource[IO, Ingestor] = for config <- AppConfig.load.toResource res <- AppResources.instance(config) given ESClient[IO] = res.elastic - forums <- Forums(res.lichess, config.ingestor.forum).toResource - forum = ForumIngestor(forums, res.store, config.ingestor.forum) - studies <- Studies(res.study, res.studyLocal, config.ingestor.study).toResource - study = StudyIngestor(studies, res.store, config.ingestor.study) - games <- Games(res.lichess, config.ingestor.game).toResource - game = GameIngestor(games, res.store, config.ingestor.game) - teams <- Teams(res.lichess, config.ingestor.team).toResource - team = TeamIngestor(teams, res.store, config.ingestor.team) - yield Executor(forum, study, game, team) - - class Executor( - val forum: ForumIngestor, - val study: StudyIngestor, - val game: GameIngestor, - val team: TeamIngestor - ): + ingestor <- Ingestor(res.lichess, res.study, res.studyLocal, res.store, config.ingestor).toResource 
+ yield ingestor + + extension (ingestor: Ingestor) def execute(opts: IndexOpts | WatchOpts): IO[Unit] = opts match case opts: IndexOpts => index(opts) @@ -57,23 +45,23 @@ object cli def index(opts: IndexOpts): IO[Unit] = opts.index match case Index.Forum => - forum.run(opts.since, opts.until, opts.dry) + ingestor.forum.run(opts.since, opts.until, opts.dry) case Index.Study => - study.run(opts.since, opts.until, opts.dry) + ingestor.study.run(opts.since, opts.until, opts.dry) case Index.Game => - game.run(opts.since, opts.until, opts.dry) + ingestor.game.run(opts.since, opts.until, opts.dry) case Index.Team => - team.run(opts.since, opts.until, opts.dry) + ingestor.team.run(opts.since, opts.until, opts.dry) case _ => - forum.run(opts.since, opts.until, opts.dry) *> - study.run(opts.since, opts.until, opts.dry) *> - game.run(opts.since, opts.until, opts.dry) *> - team.run(opts.since, opts.until, opts.dry) + ingestor.forum.run(opts.since, opts.until, opts.dry) *> + ingestor.study.run(opts.since, opts.until, opts.dry) *> + ingestor.game.run(opts.since, opts.until, opts.dry) *> + ingestor.team.run(opts.since, opts.until, opts.dry) def watch(opts: WatchOpts): IO[Unit] = opts.index match case Index.Game => - game.watch(opts.since.some, opts.dry) + ingestor.game.watch(opts.since.some, opts.dry) case _ => IO.println("We only support game watch for now") object opts: diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index 658f72f2..335a0b71 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -6,8 +6,14 @@ import cats.syntax.all.* import mongo4cats.database.MongoDatabase import org.typelevel.log4cats.LoggerFactory -trait Ingestor: - def run(): IO[Unit] +class Ingestor( + val forum: ForumIngestor, + val study: StudyIngestor, + val game: GameIngestor, + val team: TeamIngestor +): + def run(): IO[Unit] = + List(forum.watch, team.watch, study.watch, 
game.watch).parSequence_ object Ingestor: @@ -15,21 +21,18 @@ object Ingestor: lichess: MongoDatabase[IO], study: MongoDatabase[IO], local: MongoDatabase[IO], - elastic: ESClient[IO], store: KVStore, config: IngestorConfig - )(using LoggerFactory[IO]): IO[Ingestor] = + )(using LoggerFactory[IO], ESClient[IO]): IO[Ingestor] = ( Forums(lichess, config.forum), Studies(study, local, config.study), Games(lichess, config.game), Teams(lichess, config.team) ).mapN: (forums, studies, games, teams) => - given ESClient[IO] = elastic - val forum = ForumIngestor(forums, store, config.forum) - val study = StudyIngestor(studies, store, config.study) - val game = GameIngestor(games, store, config.game) - val team = TeamIngestor(teams, store, config.team) - new Ingestor: - def run(): IO[Unit] = - List(forum.watch, team.watch, study.watch, game.watch).parSequence_ + new Ingestor( + ForumIngestor(forums, store, config.forum), + StudyIngestor(studies, store, config.study), + GameIngestor(games, store, config.game), + TeamIngestor(teams, store, config.team) + ) From 8322327527ac229e67fbb00ba18f7d4504577fdd Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 18:31:51 +0100 Subject: [PATCH 09/18] Refactor ingestors --- .../src/main/scala/ingestor.forum.scala | 22 ++++++++----------- .../src/main/scala/ingestor.game.scala | 13 +++-------- .../src/main/scala/ingestor.study.scala | 22 +++++++++---------- .../src/main/scala/ingestor.team.scala | 22 ++++++++----------- modules/ingestor/src/main/scala/package.scala | 6 +++++ 5 files changed, 37 insertions(+), 48 deletions(-) diff --git a/modules/ingestor/src/main/scala/ingestor.forum.scala b/modules/ingestor/src/main/scala/ingestor.forum.scala index d01d922d..91a861a5 100644 --- a/modules/ingestor/src/main/scala/ingestor.forum.scala +++ b/modules/ingestor/src/main/scala/ingestor.forum.scala @@ -27,14 +27,12 @@ object ForumIngestor: def watch: IO[Unit] = fs2.Stream - .eval(startAt.flatTap(since => info"Starting forum ingestor from 
$since")) - .flatMap: last => - forums - .watch(last) - .evalMap: result => - storeBulk(index, result.toIndex) - *> deleteMany(index, result.toDelete) - *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now())) + .eval(startAt) + .flatMap(forums.watch) + .evalMap: result => + storeBulk(index, result.toIndex) + *> deleteMany(index, result.toDelete) + *> store.saveLastIndexedTimestamp(index, result.timestamp) .compile .drain @@ -50,9 +48,7 @@ object ForumIngestor: .compile .drain - private def saveLastIndexedTimestamp(time: Instant): IO[Unit] = - store.put(index.value, time) - *> info"Stored last indexed time ${time.getEpochSecond} for $index" - private def startAt: IO[Option[Instant]] = - config.startAt.fold(store.get(index.value))(_.some.pure[IO]) + config.startAt + .fold(store.get(index.value))(_.some.pure[IO]) + .flatTap(since => info"Starting forum ingestor from $since") diff --git a/modules/ingestor/src/main/scala/ingestor.game.scala b/modules/ingestor/src/main/scala/ingestor.game.scala index b61df57c..8a5687dd 100644 --- a/modules/ingestor/src/main/scala/ingestor.game.scala +++ b/modules/ingestor/src/main/scala/ingestor.game.scala @@ -28,9 +28,10 @@ object GameIngestor: def watch: IO[Unit] = fs2.Stream - .eval(startAt.flatTap(since => info"Starting game ingestor from $since")) + .eval(store.startAt(index, config.startAt).flatTap(since => info"Starting game ingestor from $since")) .flatMap(games.watch) - .evalMap(updateElastic(_, false)) + .evalMap: result => + updateElastic(result, false) *> store.saveLastIndexedTimestamp(index, result.timestamp) .compile .drain @@ -55,12 +56,4 @@ object GameIngestor: *> result.toDelete.traverse_(x => debug"Would delete $x"), storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) - *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now)) ) - - private def saveLastIndexedTimestamp(time: Instant): IO[Unit] = - store.put(index.value, time) - *> info"Stored last indexed time 
${time.getEpochSecond} for $index" - - private def startAt: IO[Option[Instant]] = - config.startAt.fold(store.get(index.value))(_.some.pure[IO]) diff --git a/modules/ingestor/src/main/scala/ingestor.study.scala b/modules/ingestor/src/main/scala/ingestor.study.scala index 693a554b..08e1ca87 100644 --- a/modules/ingestor/src/main/scala/ingestor.study.scala +++ b/modules/ingestor/src/main/scala/ingestor.study.scala @@ -25,15 +25,12 @@ object StudyIngestor: given Logger[IO] = LoggerFactory[IO].getLogger def watch: IO[Unit] = fs2.Stream - .eval( - config.startAt.fold(store.get(index.value))(_.some.pure[IO]) - ) - .flatMap: since => - studies - .watch(since) - .evalMap: result => - storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) - *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now())) + .eval(startAt) + .flatMap(studies.watch) + .evalMap: result => + storeBulk(index, result.toIndex) + *> deleteMany(index, result.toDelete) + *> store.saveLastIndexedTimestamp(index, result.timestamp) .compile .drain @@ -49,6 +46,7 @@ object StudyIngestor: .compile .drain - def saveLastIndexedTimestamp(time: Instant): IO[Unit] = - store.put(index.value, time) - *> info"Stored last indexed time ${time.getEpochSecond} for $index" + private def startAt: IO[Option[Instant]] = + config.startAt + .fold(store.get(index.value))(_.some.pure[IO]) + .flatTap(since => info"Starting forum ingestor from $since") diff --git a/modules/ingestor/src/main/scala/ingestor.team.scala b/modules/ingestor/src/main/scala/ingestor.team.scala index e7a212bb..33502c17 100644 --- a/modules/ingestor/src/main/scala/ingestor.team.scala +++ b/modules/ingestor/src/main/scala/ingestor.team.scala @@ -25,14 +25,12 @@ object TeamIngestor: given Logger[IO] = summon[LoggerFactory[IO]].getLogger def watch = fs2.Stream - .eval(startAt.flatTap(since => info"Starting team ingestor from $since")) - .flatMap: last => - teams - .watch(last) - .evalMap: result => - storeBulk(index, result.toIndex) - 
*> deleteMany(index, result.toDelete) - *> saveLastIndexedTimestamp(result.timestamp.getOrElse(Instant.now)) + .eval(startAt) + .flatMap(teams.watch) + .evalMap: result => + storeBulk(index, result.toIndex) + *> deleteMany(index, result.toDelete) + *> store.saveLastIndexedTimestamp(index, result.timestamp) .compile .drain @@ -48,9 +46,7 @@ object TeamIngestor: .compile .drain - private def saveLastIndexedTimestamp(time: Instant): IO[Unit] = - store.put(index.value, time) - *> info"Stored last indexed time ${time.getEpochSecond} for $index" - private def startAt: IO[Option[Instant]] = - config.startAt.fold(store.get(index.value))(_.some.pure[IO]) + config.startAt + .fold(store.get(index.value))(_.some.pure[IO]) + .flatTap(since => info"Starting team ingestor from $since") diff --git a/modules/ingestor/src/main/scala/package.scala b/modules/ingestor/src/main/scala/package.scala index f4b6e5e2..a0c29859 100644 --- a/modules/ingestor/src/main/scala/package.scala +++ b/modules/ingestor/src/main/scala/package.scala @@ -58,3 +58,9 @@ def storeBulk[A](index: Index, sources: List[(String, A)])(using Schema[A], Logg *> Logger[IO].info(s"Indexed ${sources.size} ${index.value}s") extension (s: String) def dollarPrefix = "$" + s + +extension (store: KVStore) + def saveLastIndexedTimestamp(index: Index, time: Option[Instant])(using Logger[IO]): IO[Unit] = + val savedTime = time.getOrElse(Instant.now()) + store.put(index.value, savedTime) + *> Logger[IO].info(s"Stored last indexed time ${savedTime.getEpochSecond} for $index") From 7b03b0163624c70d97ec341966de05e114608cd8 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 18:52:41 +0100 Subject: [PATCH 10/18] No need to summon --- modules/app/src/main/scala/service.health.scala | 2 +- modules/app/src/main/scala/service.search.scala | 9 +-------- modules/ingestor/src/main/scala/ingestor.game.scala | 7 ++++++- modules/ingestor/src/main/scala/ingestor.team.scala | 2 +- modules/ingestor/src/main/scala/mongo.game.scala | 2 
+- modules/ingestor/src/main/scala/mongo.study.scala | 2 +- modules/ingestor/src/main/scala/mongo.team.scala | 2 +- 7 files changed, 12 insertions(+), 14 deletions(-) diff --git a/modules/app/src/main/scala/service.health.scala b/modules/app/src/main/scala/service.health.scala index bbde604e..2956456c 100644 --- a/modules/app/src/main/scala/service.health.scala +++ b/modules/app/src/main/scala/service.health.scala @@ -8,7 +8,7 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory } class HealthServiceImpl(esClient: ESClient[IO])(using LoggerFactory[IO]) extends HealthService[IO]: - given logger: Logger[IO] = summon[LoggerFactory[IO]].getLogger + given logger: Logger[IO] = LoggerFactory[IO].getLogger override def healthCheck(): IO[HealthCheckOutput] = esClient.status diff --git a/modules/app/src/main/scala/service.search.scala b/modules/app/src/main/scala/service.search.scala index 0b8a5faf..135c19a6 100644 --- a/modules/app/src/main/scala/service.search.scala +++ b/modules/app/src/main/scala/service.search.scala @@ -2,7 +2,6 @@ package lila.search package app import cats.effect.* -import com.sksamuel.elastic4s.Indexable import io.github.arainko.ducktape.* import lila.search.forum.Forum import lila.search.game.Game @@ -11,7 +10,6 @@ import lila.search.study.Study import lila.search.team.Team import org.typelevel.log4cats.{ Logger, LoggerFactory } import smithy4s.Timestamp -import smithy4s.schema.Schema import java.time.Instant @@ -19,7 +17,7 @@ class SearchServiceImpl(esClient: ESClient[IO])(using LoggerFactory[IO]) extends import SearchServiceImpl.given - given logger: Logger[IO] = summon[LoggerFactory[IO]].getLogger + given logger: Logger[IO] = LoggerFactory[IO].getLogger override def count(query: Query): IO[CountOutput] = esClient @@ -68,8 +66,3 @@ object SearchServiceImpl: case _: Query.Game => Index.Game case _: Query.Study => Index.Study case _: Query.Team => Index.Team - - import smithy4s.json.Json.given - import 
com.github.plokhotnyuk.jsoniter_scala.core.* - - given [A: Schema]: Indexable[A] = (a: A) => writeToString(a) diff --git a/modules/ingestor/src/main/scala/ingestor.game.scala b/modules/ingestor/src/main/scala/ingestor.game.scala index 8a5687dd..9e90f8dc 100644 --- a/modules/ingestor/src/main/scala/ingestor.game.scala +++ b/modules/ingestor/src/main/scala/ingestor.game.scala @@ -28,7 +28,7 @@ object GameIngestor: def watch: IO[Unit] = fs2.Stream - .eval(store.startAt(index, config.startAt).flatTap(since => info"Starting game ingestor from $since")) + .eval(startAt) .flatMap(games.watch) .evalMap: result => updateElastic(result, false) *> store.saveLastIndexedTimestamp(index, result.timestamp) @@ -57,3 +57,8 @@ object GameIngestor: storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) ) + + private def startAt: IO[Option[Instant]] = + config.startAt + .fold(store.get(index.value))(_.some.pure[IO]) + .flatTap(since => info"Starting forum ingestor from $since") diff --git a/modules/ingestor/src/main/scala/ingestor.team.scala b/modules/ingestor/src/main/scala/ingestor.team.scala index 33502c17..8e6d81f4 100644 --- a/modules/ingestor/src/main/scala/ingestor.team.scala +++ b/modules/ingestor/src/main/scala/ingestor.team.scala @@ -22,7 +22,7 @@ object TeamIngestor: LoggerFactory[IO], ESClient[IO] ): TeamIngestor = new: - given Logger[IO] = summon[LoggerFactory[IO]].getLogger + given Logger[IO] = LoggerFactory[IO].getLogger def watch = fs2.Stream .eval(startAt) diff --git a/modules/ingestor/src/main/scala/mongo.game.scala b/modules/ingestor/src/main/scala/mongo.game.scala index 9e33313f..88ef6a6c 100644 --- a/modules/ingestor/src/main/scala/mongo.game.scala +++ b/modules/ingestor/src/main/scala/mongo.game.scala @@ -70,7 +70,7 @@ object Games: Aggregate.matchBy(eventFilter.and(changeFilter)).combinedWith(Aggregate.project(eventProjection)) def apply(mongo: MongoDatabase[IO], config: IngestorConfig.Game)(using LoggerFactory[IO]): IO[Games] = - given 
Logger[IO] = summon[LoggerFactory[IO]].getLogger + given Logger[IO] = LoggerFactory[IO].getLogger mongo.getCollectionWithCodec[DbGame]("game5").map(apply(config)) def apply(config: IngestorConfig.Game)(games: MongoCollection[IO, DbGame])(using Logger[IO]): Games = new: diff --git a/modules/ingestor/src/main/scala/mongo.study.scala b/modules/ingestor/src/main/scala/mongo.study.scala index 239a9e12..b8c5ec54 100644 --- a/modules/ingestor/src/main/scala/mongo.study.scala +++ b/modules/ingestor/src/main/scala/mongo.study.scala @@ -32,7 +32,7 @@ object Studies: local: MongoDatabase[IO], config: IngestorConfig.Study )(using LoggerFactory[IO]): IO[Studies] = - given Logger[IO] = summon[LoggerFactory[IO]].getLogger + given Logger[IO] = LoggerFactory[IO].getLogger (study.getCollection("study"), ChapterRepo(study), local.getCollection("oplog.rs")) .mapN(apply(config)) diff --git a/modules/ingestor/src/main/scala/mongo.team.scala b/modules/ingestor/src/main/scala/mongo.team.scala index fc8d59cb..bf663f0f 100644 --- a/modules/ingestor/src/main/scala/mongo.team.scala +++ b/modules/ingestor/src/main/scala/mongo.team.scala @@ -40,7 +40,7 @@ object Teams: def apply(mongo: MongoDatabase[IO], config: IngestorConfig.Team)(using LoggerFactory[IO] ): IO[Teams] = - given Logger[IO] = summon[LoggerFactory[IO]].getLogger + given Logger[IO] = LoggerFactory[IO].getLogger mongo.getCollection("team").map(apply(config)) def apply(config: IngestorConfig.Team)(teams: MongoCollection)(using Logger[IO]): Teams = new: From bec124acf492f33049ad019d7b1e7eeaf90ec9c3 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 19:53:21 +0100 Subject: [PATCH 11/18] Use generic Ingestor --- modules/ingestor/src/main/scala/Repo.scala | 17 ++++ modules/ingestor/src/main/scala/app.scala | 2 +- modules/ingestor/src/main/scala/cli.scala | 6 +- .../src/main/scala/ingestor.forum.scala | 54 ------------ .../src/main/scala/ingestor.game.scala | 64 -------------- .../ingestor/src/main/scala/ingestor.scala | 83 
++++++++++++------- .../src/main/scala/ingestor.study.scala | 52 ------------ .../src/main/scala/ingestor.team.scala | 52 ------------ .../ingestor/src/main/scala/ingestors.scala | 38 +++++++++ .../ingestor/src/main/scala/mongo.forum.scala | 19 ++--- .../ingestor/src/main/scala/mongo.game.scala | 21 ++--- .../ingestor/src/main/scala/mongo.study.scala | 17 ++-- .../ingestor/src/main/scala/mongo.team.scala | 12 +-- 13 files changed, 136 insertions(+), 301 deletions(-) create mode 100644 modules/ingestor/src/main/scala/Repo.scala delete mode 100644 modules/ingestor/src/main/scala/ingestor.forum.scala delete mode 100644 modules/ingestor/src/main/scala/ingestor.game.scala delete mode 100644 modules/ingestor/src/main/scala/ingestor.study.scala delete mode 100644 modules/ingestor/src/main/scala/ingestor.team.scala create mode 100644 modules/ingestor/src/main/scala/ingestors.scala diff --git a/modules/ingestor/src/main/scala/Repo.scala b/modules/ingestor/src/main/scala/Repo.scala new file mode 100644 index 00000000..117e381c --- /dev/null +++ b/modules/ingestor/src/main/scala/Repo.scala @@ -0,0 +1,17 @@ +package lila.search +package ingestor + +import cats.effect.* +import cats.syntax.all.* + +import java.time.Instant + +import Repo.Result + +trait Repo[A]: + def watch(since: Option[Instant]): fs2.Stream[IO, Result[A]] + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result[A]] + +object Repo: + type SourceWithId[A] = (String, A) + case class Result[A](toIndex: List[SourceWithId[A]], toDelete: List[Id], timestamp: Option[Instant]) diff --git a/modules/ingestor/src/main/scala/app.scala b/modules/ingestor/src/main/scala/app.scala index 3a84ae20..3d8f3599 100644 --- a/modules/ingestor/src/main/scala/app.scala +++ b/modules/ingestor/src/main/scala/app.scala @@ -34,7 +34,7 @@ object App extends IOApp.Simple: class IngestorApp(res: AppResources, config: AppConfig)(using Logger[IO], LoggerFactory[IO]): def run(): Resource[IO, Unit] = given ESClient[IO] = res.elastic - 
Ingestor(res.lichess, res.study, res.studyLocal, res.store, config.ingestor) + Ingestors(res.lichess, res.study, res.studyLocal, res.store, config.ingestor) .flatMap(_.run()) .toResource .evalTap(_ => Logger[IO].info("Ingestor started")) diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index a0fab8c3..b29eb414 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -28,15 +28,15 @@ object cli opts.parse.map: opts => makeIngestor.use(_.execute(opts).as(ExitCode.Success)) - def makeIngestor: Resource[IO, Ingestor] = + def makeIngestor: Resource[IO, Ingestors] = for config <- AppConfig.load.toResource res <- AppResources.instance(config) given ESClient[IO] = res.elastic - ingestor <- Ingestor(res.lichess, res.study, res.studyLocal, res.store, config.ingestor).toResource + ingestor <- Ingestors(res.lichess, res.study, res.studyLocal, res.store, config.ingestor).toResource yield ingestor - extension (ingestor: Ingestor) + extension (ingestor: Ingestors) def execute(opts: IndexOpts | WatchOpts): IO[Unit] = opts match case opts: IndexOpts => index(opts) diff --git a/modules/ingestor/src/main/scala/ingestor.forum.scala b/modules/ingestor/src/main/scala/ingestor.forum.scala deleted file mode 100644 index 91a861a5..00000000 --- a/modules/ingestor/src/main/scala/ingestor.forum.scala +++ /dev/null @@ -1,54 +0,0 @@ -package lila.search -package ingestor - -import cats.effect.IO -import cats.syntax.all.* -import org.typelevel.log4cats.syntax.* -import org.typelevel.log4cats.{ Logger, LoggerFactory } - -import java.time.Instant - -trait ForumIngestor: - // watch change events from MongoDB and ingest forum posts into elastic search - def watch: IO[Unit] - // Fetch posts in [since, until] and ingest into elastic search - def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] - -object ForumIngestor: - - private val index = Index.Forum - - def apply(forums: Forums, store: 
KVStore, config: IngestorConfig.Forum)(using - LoggerFactory[IO], - ESClient[IO] - ): ForumIngestor = new: - - given Logger[IO] = LoggerFactory[IO].getLogger - - def watch: IO[Unit] = - fs2.Stream - .eval(startAt) - .flatMap(forums.watch) - .evalMap: result => - storeBulk(index, result.toIndex) - *> deleteMany(index, result.toDelete) - *> store.saveLastIndexedTimestamp(index, result.timestamp) - .compile - .drain - - def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] = - forums - .fetch(since, until) - .evalMap: result => - dryRun.fold( - result.toIndex.traverse_(doc => debug"Would index $doc") - *> result.toDelete.traverse_(doc => debug"Would delete $doc"), - storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) - ) - .compile - .drain - - private def startAt: IO[Option[Instant]] = - config.startAt - .fold(store.get(index.value))(_.some.pure[IO]) - .flatTap(since => info"Starting forum ingestor from $since") diff --git a/modules/ingestor/src/main/scala/ingestor.game.scala b/modules/ingestor/src/main/scala/ingestor.game.scala deleted file mode 100644 index 9e90f8dc..00000000 --- a/modules/ingestor/src/main/scala/ingestor.game.scala +++ /dev/null @@ -1,64 +0,0 @@ -package lila.search -package ingestor - -import cats.effect.* -import cats.syntax.all.* -import org.typelevel.log4cats.syntax.* -import org.typelevel.log4cats.{ Logger, LoggerFactory } - -import java.time.Instant - -trait GameIngestor: - // watch change events from game5 collection and ingest games into elastic search - def watch: IO[Unit] - // Similar to watch but started from a given timestamp - def watch(since: Option[Instant], dryRun: Boolean): IO[Unit] - // Fetch posts in [since, until] and ingest into elastic search - def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] - -object GameIngestor: - - private val index = Index.Game - - def apply(games: Games, store: KVStore, config: IngestorConfig.Game)(using - LoggerFactory[IO], - ESClient[IO] - ): 
GameIngestor = new: - given Logger[IO] = LoggerFactory[IO].getLogger - - def watch: IO[Unit] = - fs2.Stream - .eval(startAt) - .flatMap(games.watch) - .evalMap: result => - updateElastic(result, false) *> store.saveLastIndexedTimestamp(index, result.timestamp) - .compile - .drain - - def watch(since: Option[Instant], dryRun: Boolean): IO[Unit] = - games - .watch(since) - .evalMap(updateElastic(_, dryRun)) - .compile - .drain - - def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] = - games - .fetch(since, until) - .evalMap(updateElastic(_, dryRun)) - .compile - .drain - - private def updateElastic(result: Games.Result, dryRun: Boolean): IO[Unit] = - dryRun.fold( - info"Would index total ${result.toIndex.size} games and delete ${result.toDelete.size} games" *> - result.toIndex.traverse_(x => debug"Would index $x") - *> result.toDelete.traverse_(x => debug"Would delete $x"), - storeBulk(index, result.toIndex) - *> deleteMany(index, result.toDelete) - ) - - private def startAt: IO[Option[Instant]] = - config.startAt - .fold(store.get(index.value))(_.some.pure[IO]) - .flatTap(since => info"Starting forum ingestor from $since") diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index 335a0b71..1e9ae833 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -3,36 +3,61 @@ package ingestor import cats.effect.* import cats.syntax.all.* -import mongo4cats.database.MongoDatabase -import org.typelevel.log4cats.LoggerFactory - -class Ingestor( - val forum: ForumIngestor, - val study: StudyIngestor, - val game: GameIngestor, - val team: TeamIngestor -): - def run(): IO[Unit] = - List(forum.watch, team.watch, study.watch, game.watch).parSequence_ +import org.typelevel.log4cats.syntax.* +import org.typelevel.log4cats.{ Logger, LoggerFactory } +import smithy4s.schema.Schema + +import java.time.Instant + +trait Ingestor: + // watch change events from 
database and ingest documents into elastic search + def watch: IO[Unit] + // Similar to watch but started from a given timestamp + def watch(since: Option[Instant], dryRun: Boolean): IO[Unit] + // Fetch documents in [since, until] and ingest into elastic search + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] object Ingestor: - def apply( - lichess: MongoDatabase[IO], - study: MongoDatabase[IO], - local: MongoDatabase[IO], - store: KVStore, - config: IngestorConfig - )(using LoggerFactory[IO], ESClient[IO]): IO[Ingestor] = - ( - Forums(lichess, config.forum), - Studies(study, local, config.study), - Games(lichess, config.game), - Teams(lichess, config.team) - ).mapN: (forums, studies, games, teams) => - new Ingestor( - ForumIngestor(forums, store, config.forum), - StudyIngestor(studies, store, config.study), - GameIngestor(games, store, config.game), - TeamIngestor(teams, store, config.team) + def apply[A: Schema](index: Index, games: Repo[A], store: KVStore, defaultStartAt: Option[Instant])(using + LoggerFactory[IO], + ESClient[IO] + ): Ingestor = new: + given Logger[IO] = LoggerFactory[IO].getLogger + + def watch: IO[Unit] = + fs2.Stream + .eval(startAt) + .flatMap(games.watch) + .evalMap: result => + updateElastic(result, false) *> store.saveLastIndexedTimestamp(index, result.timestamp) + .compile + .drain + + def watch(since: Option[Instant], dryRun: Boolean): IO[Unit] = + games + .watch(since) + .evalMap(updateElastic(_, dryRun)) + .compile + .drain + + def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] = + games + .fetch(since, until) + .evalMap(updateElastic(_, dryRun)) + .compile + .drain + + private def updateElastic(result: Repo.Result[A], dryRun: Boolean): IO[Unit] = + dryRun.fold( + info"Would index total ${result.toIndex.size} games and delete ${result.toDelete.size} games" *> + result.toIndex.traverse_(x => debug"Would index $x") + *> result.toDelete.traverse_(x => debug"Would delete $x"), + storeBulk(index, 
result.toIndex) + *> deleteMany(index, result.toDelete) ) + + private def startAt: IO[Option[Instant]] = + defaultStartAt + .fold(store.get(index.value))(_.some.pure[IO]) + .flatTap(since => info"Starting ${index.value} ingestor from $since") diff --git a/modules/ingestor/src/main/scala/ingestor.study.scala b/modules/ingestor/src/main/scala/ingestor.study.scala deleted file mode 100644 index 08e1ca87..00000000 --- a/modules/ingestor/src/main/scala/ingestor.study.scala +++ /dev/null @@ -1,52 +0,0 @@ -package lila.search -package ingestor - -import cats.effect.IO -import cats.syntax.all.* -import org.typelevel.log4cats.syntax.* -import org.typelevel.log4cats.{ Logger, LoggerFactory } - -import java.time.Instant - -trait StudyIngestor: - // pull changes from study MongoDB and ingest into elastic search - def watch: IO[Unit] - def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] - -object StudyIngestor: - - private val index = Index.Study - - def apply( - studies: Studies, - store: KVStore, - config: IngestorConfig.Study - )(using LoggerFactory[IO], ESClient[IO]): StudyIngestor = new: - given Logger[IO] = LoggerFactory[IO].getLogger - def watch: IO[Unit] = - fs2.Stream - .eval(startAt) - .flatMap(studies.watch) - .evalMap: result => - storeBulk(index, result.toIndex) - *> deleteMany(index, result.toDelete) - *> store.saveLastIndexedTimestamp(index, result.timestamp) - .compile - .drain - - def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] = - studies - .fetch(since, until) - .evalMap: result => - dryRun.fold( - result.toIndex.traverse_(doc => debug"Would index $doc") - *> result.toDelete.traverse_(doc => debug"Would delete $doc"), - storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) - ) - .compile - .drain - - private def startAt: IO[Option[Instant]] = - config.startAt - .fold(store.get(index.value))(_.some.pure[IO]) - .flatTap(since => info"Starting forum ingestor from $since") diff --git 
a/modules/ingestor/src/main/scala/ingestor.team.scala b/modules/ingestor/src/main/scala/ingestor.team.scala deleted file mode 100644 index 8e6d81f4..00000000 --- a/modules/ingestor/src/main/scala/ingestor.team.scala +++ /dev/null @@ -1,52 +0,0 @@ -package lila.search -package ingestor - -import cats.effect.IO -import cats.syntax.all.* -import org.typelevel.log4cats.syntax.* -import org.typelevel.log4cats.{ Logger, LoggerFactory } - -import java.time.Instant - -trait TeamIngestor: - // watch change events from MongoDB and ingest team data into elastic search - def watch: IO[Unit] - // Fetch teams in [since, until] and ingest into elastic search - def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] - -object TeamIngestor: - - private val index = Index.Team - - def apply(teams: Teams, store: KVStore, config: IngestorConfig.Team)(using - LoggerFactory[IO], - ESClient[IO] - ): TeamIngestor = new: - given Logger[IO] = LoggerFactory[IO].getLogger - def watch = - fs2.Stream - .eval(startAt) - .flatMap(teams.watch) - .evalMap: result => - storeBulk(index, result.toIndex) - *> deleteMany(index, result.toDelete) - *> store.saveLastIndexedTimestamp(index, result.timestamp) - .compile - .drain - - def run(since: Instant, until: Instant, dryRun: Boolean) = - teams - .fetch(since, until) - .evalMap: result => - dryRun.fold( - result.toIndex.traverse_(doc => debug"Would index $doc") - *> result.toDelete.traverse_(doc => debug"Would delete $doc"), - storeBulk(index, result.toIndex) *> deleteMany(index, result.toDelete) - ) - .compile - .drain - - private def startAt: IO[Option[Instant]] = - config.startAt - .fold(store.get(index.value))(_.some.pure[IO]) - .flatTap(since => info"Starting team ingestor from $since") diff --git a/modules/ingestor/src/main/scala/ingestors.scala b/modules/ingestor/src/main/scala/ingestors.scala new file mode 100644 index 00000000..20d2e78d --- /dev/null +++ b/modules/ingestor/src/main/scala/ingestors.scala @@ -0,0 +1,38 @@ +package 
lila.search +package ingestor + +import cats.effect.* +import cats.syntax.all.* +import mongo4cats.database.MongoDatabase +import org.typelevel.log4cats.LoggerFactory + +class Ingestors( + val forum: Ingestor, + val study: Ingestor, + val game: Ingestor, + val team: Ingestor +): + def run(): IO[Unit] = + List(forum.watch, team.watch, study.watch, game.watch).parSequence_ + +object Ingestors: + + def apply( + lichess: MongoDatabase[IO], + study: MongoDatabase[IO], + local: MongoDatabase[IO], + store: KVStore, + config: IngestorConfig + )(using LoggerFactory[IO], ESClient[IO]): IO[Ingestors] = + ( + Forums(lichess, config.forum), + Studies(study, local, config.study), + Games(lichess, config.game), + Teams(lichess, config.team) + ).mapN: (forums, studies, games, teams) => + new Ingestors( + Ingestor(Index.Forum, forums, store, config.forum.startAt), + Ingestor(Index.Study, studies, store, config.study.startAt), + Ingestor(Index.Game, games, store, config.game.startAt), + Ingestor(Index.Team, teams, store, config.team.startAt) + ) diff --git a/modules/ingestor/src/main/scala/mongo.forum.scala b/modules/ingestor/src/main/scala/mongo.forum.scala index 60b896fa..c1c50488 100644 --- a/modules/ingestor/src/main/scala/mongo.forum.scala +++ b/modules/ingestor/src/main/scala/mongo.forum.scala @@ -15,17 +15,10 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant import scala.concurrent.duration.* -import Forums.Result - -trait Forums: - def watch(since: Option[Instant]): fs2.Stream[IO, Result] - def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] +import Repo.{ Result, SourceWithId } object Forums: - private type SourceWithId = (String, ForumSource) - case class Result(toIndex: List[(String, ForumSource)], toDelete: List[Id], timestamp: Option[Instant]) - private val interestedOperations = List(DELETE, INSERT, REPLACE, UPDATE).map(_.getValue) private def maxPostSizeFilter(max: Int) = @@ -46,14 +39,14 @@ object Forums: def 
apply(mongo: MongoDatabase[IO], config: IngestorConfig.Forum)(using LoggerFactory[IO] - ): IO[Forums] = + ): IO[Repo[ForumSource]] = given Logger[IO] = LoggerFactory[IO].getLogger (mongo.getCollection("f_topic"), mongo.getCollection("f_post")).mapN(apply(config)) def apply(config: IngestorConfig.Forum)( topics: MongoCollection, posts: MongoCollection - )(using Logger[IO]): Forums = new: + )(using Logger[IO]): Repo[ForumSource] = new: def fetch(since: Instant, until: Instant) = val filter = range(F.createdAt)(since, until.some) @@ -73,7 +66,7 @@ object Forums: .map: sources => Result(sources, toDelete.flatten(_.id.map(Id.apply)), none) - def watch(since: Option[Instant]): fs2.Stream[IO, Result] = + def watch(since: Option[Instant]): fs2.Stream[IO, Result[ForumSource]] = val builder = posts.watch(aggregate(config.maxPostLength)) // skip the first event if we're starting from a specific timestamp // since the event at that timestamp is already indexed @@ -105,7 +98,7 @@ object Forums: .map(_.map(doc => (doc.id, doc.getString(Topic.name)).mapN(_ -> _)).flatten.toMap) extension (events: List[Document]) - private def toSources: IO[List[SourceWithId]] = + private def toSources: IO[List[SourceWithId[ForumSource]]] = val topicIds = events.flatMap(_.topicId).distinct topicIds.isEmpty.fold( info"no topics found for posts: $events".as(Nil), @@ -118,7 +111,7 @@ object Forums: extension (doc: Document) - private def toSource(topicMap: Map[String, String]): IO[Option[SourceWithId]] = + private def toSource(topicMap: Map[String, String]): IO[Option[SourceWithId[ForumSource]]] = (doc.id, doc.topicId) .flatMapN: (id, topicId) => doc.toSource(topicMap.get(topicId), topicId).map(id -> _) diff --git a/modules/ingestor/src/main/scala/mongo.game.scala b/modules/ingestor/src/main/scala/mongo.game.scala index 88ef6a6c..f8173ef4 100644 --- a/modules/ingestor/src/main/scala/mongo.game.scala +++ b/modules/ingestor/src/main/scala/mongo.game.scala @@ -20,17 +20,10 @@ import 
org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant import scala.concurrent.duration.* -import Games.Result - -trait Games: - def watch(since: Option[Instant]): fs2.Stream[IO, Result] - def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] +import Repo.Result object Games: - private type SourceWithId = (String, GameSource) - case class Result(toIndex: List[SourceWithId], toDelete: List[Id], timestamp: Option[Instant]) - private val interestedOperations = List(UPDATE, DELETE).map(_.getValue) private val eventFilter = Filter.in("operationType", interestedOperations) @@ -69,13 +62,17 @@ object Games: private val aggregate = Aggregate.matchBy(eventFilter.and(changeFilter)).combinedWith(Aggregate.project(eventProjection)) - def apply(mongo: MongoDatabase[IO], config: IngestorConfig.Game)(using LoggerFactory[IO]): IO[Games] = + def apply(mongo: MongoDatabase[IO], config: IngestorConfig.Game)(using + LoggerFactory[IO] + ): IO[Repo[GameSource]] = given Logger[IO] = LoggerFactory[IO].getLogger mongo.getCollectionWithCodec[DbGame]("game5").map(apply(config)) - def apply(config: IngestorConfig.Game)(games: MongoCollection[IO, DbGame])(using Logger[IO]): Games = new: + def apply(config: IngestorConfig.Game)(games: MongoCollection[IO, DbGame])(using + Logger[IO] + ): Repo[GameSource] = new: - def watch(since: Option[Instant]): fs2.Stream[IO, Result] = + def watch(since: Option[Instant]): fs2.Stream[IO, Result[GameSource]] = changes(since) .map: events => val lastEventTimestamp = events.lastOption.flatMap(_.clusterTime).flatMap(_.asInstant) @@ -86,7 +83,7 @@ object Games: lastEventTimestamp ) - def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] = + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result[GameSource]] = val filter = range(F.createdAt)(since, until.some) .or(range(F.updatedAt)(since, until.some)) games diff --git a/modules/ingestor/src/main/scala/mongo.study.scala 
b/modules/ingestor/src/main/scala/mongo.study.scala index b8c5ec54..7a2bac2e 100644 --- a/modules/ingestor/src/main/scala/mongo.study.scala +++ b/modules/ingestor/src/main/scala/mongo.study.scala @@ -11,17 +11,10 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant -import Studies.Result - -trait Studies: - def watch(since: Option[Instant]): fs2.Stream[IO, Result] - def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] +import Repo.Result object Studies: - private type SourceWithId = (String, StudySource) - case class Result(toIndex: List[SourceWithId], toDelete: List[Id], timestamp: Option[Instant]) - private val interestedfields = List("_id", F.name, F.members, F.ownerId, F.visibility, F.topics, F.likes) private val indexDocProjection = Projection.include(interestedfields) @@ -31,7 +24,7 @@ object Studies: study: MongoDatabase[IO], local: MongoDatabase[IO], config: IngestorConfig.Study - )(using LoggerFactory[IO]): IO[Studies] = + )(using LoggerFactory[IO]): IO[Repo[StudySource]] = given Logger[IO] = LoggerFactory[IO].getLogger (study.getCollection("study"), ChapterRepo(study), local.getCollection("oplog.rs")) .mapN(apply(config)) @@ -40,14 +33,14 @@ object Studies: studies: MongoCollection, chapters: ChapterRepo, oplogs: MongoCollection - )(using Logger[IO]): Studies = new: + )(using Logger[IO]): Repo[StudySource] = new: - def watch(since: Option[Instant]): fs2.Stream[IO, Result] = + def watch(since: Option[Instant]): fs2.Stream[IO, Result[StudySource]] = intervalStream(since) .meteredStartImmediately(config.interval) .flatMap(fetch) - def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] = + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result[StudySource]] = // fs2.Stream.eval(info"Indexing studies from $since to $until") ++ // fs2.Stream.eval(info"deleting studies from $since to $until") ++ pullAndIndex(since, until) diff --git a/modules/ingestor/src/main/scala/mongo.team.scala 
b/modules/ingestor/src/main/scala/mongo.team.scala index bf663f0f..b68e5fbb 100644 --- a/modules/ingestor/src/main/scala/mongo.team.scala +++ b/modules/ingestor/src/main/scala/mongo.team.scala @@ -15,16 +15,10 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant import scala.concurrent.duration.* -import Teams.Result -trait Teams: - def watch(since: Option[Instant]): fs2.Stream[IO, Result] - def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result] +import Repo.Result object Teams: - type SourceWithId = (String, TeamSource) - case class Result(toIndex: List[SourceWithId], toDelete: List[Id], timestamp: Option[Instant]) - private val interestedOperations = List(DELETE, INSERT, UPDATE, REPLACE).map(_.getValue) private val eventFilter = Filter.in("operationType", interestedOperations) @@ -39,11 +33,11 @@ object Teams: def apply(mongo: MongoDatabase[IO], config: IngestorConfig.Team)(using LoggerFactory[IO] - ): IO[Teams] = + ): IO[Repo[TeamSource]] = given Logger[IO] = LoggerFactory[IO].getLogger mongo.getCollection("team").map(apply(config)) - def apply(config: IngestorConfig.Team)(teams: MongoCollection)(using Logger[IO]): Teams = new: + def apply(config: IngestorConfig.Team)(teams: MongoCollection)(using Logger[IO]): Repo[TeamSource] = new: def watch(since: Option[Instant]) = // skip the first event if we're starting from a specific timestamp From b649e7789287415c52f95d03ce0f1310127e3baf Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 19:58:15 +0100 Subject: [PATCH 12/18] cli: Implement watch for all target --- .../e2e/src/test/scala/IntegrationSuite.scala | 2 +- modules/ingestor/src/main/scala/cli.scala | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/modules/e2e/src/test/scala/IntegrationSuite.scala b/modules/e2e/src/test/scala/IntegrationSuite.scala index 5957f328..18a764da 100644 --- a/modules/e2e/src/test/scala/IntegrationSuite.scala +++ 
b/modules/e2e/src/test/scala/IntegrationSuite.scala @@ -41,7 +41,7 @@ object IntegrationSuite extends IOSuite: def testAppConfig(elastic: ElasticConfig) = AppConfig( server = - HttpServerConfig(ip"0.0.0.0", port"9999", apiLogger = false, shutdownTimeout = 30, enableDocs = false), + HttpServerConfig(ip"0.0.0.0", port"9999", apiLogger = false, shutdownTimeout = 1, enableDocs = false), elastic = elastic ) diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index b29eb414..9279c0b9 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ b/modules/ingestor/src/main/scala/cli.scala @@ -62,11 +62,21 @@ object cli opts.index match case Index.Game => ingestor.game.watch(opts.since.some, opts.dry) - case _ => IO.println("We only support game watch for now") + case Index.Forum => + ingestor.forum.watch(opts.since.some, opts.dry) + case Index.Team => + ingestor.team.watch(opts.since.some, opts.dry) + case Index.Study => + ingestor.study.watch(opts.since.some, opts.dry) + case _ => + ingestor.forum.watch(opts.since.some, opts.dry) *> + ingestor.team.watch(opts.since.some, opts.dry) *> + ingestor.study.watch(opts.since.some, opts.dry) *> + ingestor.game.watch(opts.since.some, opts.dry) object opts: case class IndexOpts(index: Index | Unit, since: Instant, until: Instant, dry: Boolean) - case class WatchOpts(index: Index, since: Instant, dry: Boolean) + case class WatchOpts(index: Index | Unit, since: Instant, dry: Boolean) def parse = Opts.subcommand("index", "index documents")(indexOpt) <+> Opts.subcommand("watch", "watch change events and index documents")(watchOpt) @@ -115,12 +125,7 @@ object opts: ) val watchOpt = ( - Opts.option[Index]( - long = "index", - help = "Target index (only `game` for now)", - short = "i", - metavar = "forum|team|study|game" - ), + singleIndexOpt orElse allIndexOpt, Opts .option[Instant]( long = "since", From f1e5d592296ca102fd9777663b6d2f05b5c66281 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 
27 Nov 2024 20:13:39 +0100 Subject: [PATCH 13/18] Rename repo object to have repo postfix --- modules/ingestor/src/main/scala/ingestors.scala | 8 ++++---- modules/ingestor/src/main/scala/mongo.forum.scala | 2 +- modules/ingestor/src/main/scala/mongo.game.scala | 2 +- modules/ingestor/src/main/scala/mongo.study.scala | 2 +- modules/ingestor/src/main/scala/mongo.team.scala | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/ingestor/src/main/scala/ingestors.scala b/modules/ingestor/src/main/scala/ingestors.scala index 20d2e78d..29680f94 100644 --- a/modules/ingestor/src/main/scala/ingestors.scala +++ b/modules/ingestor/src/main/scala/ingestors.scala @@ -25,10 +25,10 @@ object Ingestors: config: IngestorConfig )(using LoggerFactory[IO], ESClient[IO]): IO[Ingestors] = ( - Forums(lichess, config.forum), - Studies(study, local, config.study), - Games(lichess, config.game), - Teams(lichess, config.team) + ForumRepo(lichess, config.forum), + StudyRepo(study, local, config.study), + GameRepo(lichess, config.game), + TeamRepo(lichess, config.team) ).mapN: (forums, studies, games, teams) => new Ingestors( Ingestor(Index.Forum, forums, store, config.forum.startAt), diff --git a/modules/ingestor/src/main/scala/mongo.forum.scala b/modules/ingestor/src/main/scala/mongo.forum.scala index c1c50488..f18c0600 100644 --- a/modules/ingestor/src/main/scala/mongo.forum.scala +++ b/modules/ingestor/src/main/scala/mongo.forum.scala @@ -17,7 +17,7 @@ import scala.concurrent.duration.* import Repo.{ Result, SourceWithId } -object Forums: +object ForumRepo: private val interestedOperations = List(DELETE, INSERT, REPLACE, UPDATE).map(_.getValue) diff --git a/modules/ingestor/src/main/scala/mongo.game.scala b/modules/ingestor/src/main/scala/mongo.game.scala index f8173ef4..a0b5e3d4 100644 --- a/modules/ingestor/src/main/scala/mongo.game.scala +++ b/modules/ingestor/src/main/scala/mongo.game.scala @@ -22,7 +22,7 @@ import scala.concurrent.duration.* import Repo.Result 
-object Games: +object GameRepo: private val interestedOperations = List(UPDATE, DELETE).map(_.getValue) private val eventFilter = Filter.in("operationType", interestedOperations) diff --git a/modules/ingestor/src/main/scala/mongo.study.scala b/modules/ingestor/src/main/scala/mongo.study.scala index 7a2bac2e..7313fdd2 100644 --- a/modules/ingestor/src/main/scala/mongo.study.scala +++ b/modules/ingestor/src/main/scala/mongo.study.scala @@ -13,7 +13,7 @@ import java.time.Instant import Repo.Result -object Studies: +object StudyRepo: private val interestedfields = List("_id", F.name, F.members, F.ownerId, F.visibility, F.topics, F.likes) diff --git a/modules/ingestor/src/main/scala/mongo.team.scala b/modules/ingestor/src/main/scala/mongo.team.scala index b68e5fbb..4565f62b 100644 --- a/modules/ingestor/src/main/scala/mongo.team.scala +++ b/modules/ingestor/src/main/scala/mongo.team.scala @@ -17,7 +17,7 @@ import scala.concurrent.duration.* import Repo.Result -object Teams: +object TeamRepo: private val interestedOperations = List(DELETE, INSERT, UPDATE, REPLACE).map(_.getValue) private val eventFilter = Filter.in("operationType", interestedOperations) From 14877375f508c9499373574d3b6a0d0226664d94 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 20:28:36 +0100 Subject: [PATCH 14/18] Rename games => repo --- modules/ingestor/src/main/scala/ingestor.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index 1e9ae833..713467c1 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -19,7 +19,7 @@ trait Ingestor: object Ingestor: - def apply[A: Schema](index: Index, games: Repo[A], store: KVStore, defaultStartAt: Option[Instant])(using + def apply[A: Schema](index: Index, repo: Repo[A], store: KVStore, defaultStartAt: Option[Instant])(using LoggerFactory[IO], ESClient[IO] ): 
Ingestor = new: @@ -28,21 +28,21 @@ object Ingestor: def watch: IO[Unit] = fs2.Stream .eval(startAt) - .flatMap(games.watch) + .flatMap(repo.watch) .evalMap: result => updateElastic(result, false) *> store.saveLastIndexedTimestamp(index, result.timestamp) .compile .drain def watch(since: Option[Instant], dryRun: Boolean): IO[Unit] = - games + repo .watch(since) .evalMap(updateElastic(_, dryRun)) .compile .drain def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] = - games + repo .fetch(since, until) .evalMap(updateElastic(_, dryRun)) .compile From 4f8be0c096e30fb6704cb044a6551fb0706f6455 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 20:43:06 +0100 Subject: [PATCH 15/18] Move some functions out of package --- modules/ingestor/src/main/scala/app.scala | 3 +- modules/ingestor/src/main/scala/cli.scala | 10 ++++-- .../ingestor/src/main/scala/ingestor.scala | 35 ++++++++++++++++--- .../ingestor/src/main/scala/ingestors.scala | 11 +++--- modules/ingestor/src/main/scala/package.scala | 29 +-------------- 5 files changed, 46 insertions(+), 42 deletions(-) diff --git a/modules/ingestor/src/main/scala/app.scala b/modules/ingestor/src/main/scala/app.scala index 3d8f3599..333bb755 100644 --- a/modules/ingestor/src/main/scala/app.scala +++ b/modules/ingestor/src/main/scala/app.scala @@ -33,8 +33,7 @@ object App extends IOApp.Simple: class IngestorApp(res: AppResources, config: AppConfig)(using Logger[IO], LoggerFactory[IO]): def run(): Resource[IO, Unit] = - given ESClient[IO] = res.elastic - Ingestors(res.lichess, res.study, res.studyLocal, res.store, config.ingestor) + Ingestors(res.lichess, res.study, res.studyLocal, res.store, res.elastic, config.ingestor) .flatMap(_.run()) .toResource .evalTap(_ => Logger[IO].info("Ingestor started")) diff --git a/modules/ingestor/src/main/scala/cli.scala b/modules/ingestor/src/main/scala/cli.scala index 9279c0b9..271857c6 100644 --- a/modules/ingestor/src/main/scala/cli.scala +++ 
b/modules/ingestor/src/main/scala/cli.scala @@ -32,8 +32,14 @@ object cli for config <- AppConfig.load.toResource res <- AppResources.instance(config) - given ESClient[IO] = res.elastic - ingestor <- Ingestors(res.lichess, res.study, res.studyLocal, res.store, config.ingestor).toResource + ingestor <- Ingestors( + res.lichess, + res.study, + res.studyLocal, + res.store, + res.elastic, + config.ingestor + ).toResource yield ingestor extension (ingestor: Ingestors) diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index 713467c1..0b32aecf 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -19,10 +19,13 @@ trait Ingestor: object Ingestor: - def apply[A: Schema](index: Index, repo: Repo[A], store: KVStore, defaultStartAt: Option[Instant])(using - LoggerFactory[IO], - ESClient[IO] - ): Ingestor = new: + def apply[A: Schema]( + index: Index, + repo: Repo[A], + store: KVStore, + elastic: ESClient[IO], + defaultStartAt: Option[Instant] + )(using LoggerFactory[IO]): Ingestor = new: given Logger[IO] = LoggerFactory[IO].getLogger def watch: IO[Unit] = @@ -30,7 +33,7 @@ object Ingestor: .eval(startAt) .flatMap(repo.watch) .evalMap: result => - updateElastic(result, false) *> store.saveLastIndexedTimestamp(index, result.timestamp) + updateElastic(result, false) *> saveLastIndexedTimestamp(index, result.timestamp) .compile .drain @@ -61,3 +64,25 @@ object Ingestor: defaultStartAt .fold(store.get(index.value))(_.some.pure[IO]) .flatTap(since => info"Starting ${index.value} ingestor from $since") + + private def deleteMany(index: Index, ids: List[Id]): IO[Unit] = + elastic + .deleteMany(index, ids) + .flatTap(_ => Logger[IO].info(s"Deleted ${ids.size} ${index.value}s")) + .handleErrorWith: e => + Logger[IO].error(e)(s"Failed to delete ${index.value}: ${ids.map(_.value).mkString(", ")}") + .whenA(ids.nonEmpty) + + private def storeBulk(index: Index, sources: 
List[(String, A)]): IO[Unit] = + Logger[IO].info(s"Received ${sources.size} docs to ${index.value}") *> + elastic + .storeBulk(index, sources) + .handleErrorWith: e => + Logger[IO].error(e)(s"Failed to ${index.value} index: ${sources.map(_._1).mkString(", ")}") + .whenA(sources.nonEmpty) + *> Logger[IO].info(s"Indexed ${sources.size} ${index.value}s") + + private def saveLastIndexedTimestamp(index: Index, time: Option[Instant]): IO[Unit] = + val savedTime = time.getOrElse(Instant.now()) + store.put(index.value, savedTime) + *> Logger[IO].info(s"Stored last indexed time ${savedTime.getEpochSecond} for $index") diff --git a/modules/ingestor/src/main/scala/ingestors.scala b/modules/ingestor/src/main/scala/ingestors.scala index 29680f94..9a36057a 100644 --- a/modules/ingestor/src/main/scala/ingestors.scala +++ b/modules/ingestor/src/main/scala/ingestors.scala @@ -22,8 +22,9 @@ object Ingestors: study: MongoDatabase[IO], local: MongoDatabase[IO], store: KVStore, + elastic: ESClient[IO], config: IngestorConfig - )(using LoggerFactory[IO], ESClient[IO]): IO[Ingestors] = + )(using LoggerFactory[IO]): IO[Ingestors] = ( ForumRepo(lichess, config.forum), StudyRepo(study, local, config.study), @@ -31,8 +32,8 @@ object Ingestors: TeamRepo(lichess, config.team) ).mapN: (forums, studies, games, teams) => new Ingestors( - Ingestor(Index.Forum, forums, store, config.forum.startAt), - Ingestor(Index.Study, studies, store, config.study.startAt), - Ingestor(Index.Game, games, store, config.game.startAt), - Ingestor(Index.Team, teams, store, config.team.startAt) + Ingestor(Index.Forum, forums, store, elastic, config.forum.startAt), + Ingestor(Index.Study, studies, store, elastic, config.study.startAt), + Ingestor(Index.Game, games, store, elastic, config.game.startAt), + Ingestor(Index.Team, teams, store, elastic, config.team.startAt) ) diff --git a/modules/ingestor/src/main/scala/package.scala b/modules/ingestor/src/main/scala/package.scala index a0c29859..4bc2a250 100644 --- 
a/modules/ingestor/src/main/scala/package.scala +++ b/modules/ingestor/src/main/scala/package.scala @@ -2,7 +2,6 @@ package lila.search package ingestor import cats.effect.IO -import cats.syntax.all.* import com.github.plokhotnyuk.jsoniter_scala.core.* import com.sksamuel.elastic4s.Indexable import mongo4cats.bson.Document @@ -10,7 +9,6 @@ import mongo4cats.collection.GenericMongoCollection import mongo4cats.models.collection.ChangeStreamDocument import mongo4cats.operations.Filter import org.bson.BsonTimestamp -import org.typelevel.log4cats.Logger import smithy4s.json.Json.given import smithy4s.schema.Schema @@ -29,8 +27,6 @@ extension (doc: Document) private def id: Option[String] = doc.getString(_id) -given [A: Schema]: Indexable[A] = (a: A) => writeToString(a) - extension (instant: Instant) inline def asBsonTimestamp: BsonTimestamp = BsonTimestamp(instant.getEpochSecond.toInt, 1) @@ -38,29 +34,6 @@ def range(field: String)(since: Instant, until: Option[Instant]): Filter = inline def gtes = Filter.gte(field, since) until.fold(gtes)(until => gtes.and(Filter.lt(field, until))) -def deleteMany(index: Index, ids: List[Id])(using Logger[IO])(using elastic: ESClient[IO]): IO[Unit] = - elastic - .deleteMany(index, ids) - .flatTap(_ => Logger[IO].info(s"Deleted ${ids.size} ${index.value}s")) - .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to delete ${index.value}: ${ids.map(_.value).mkString(", ")}") - .whenA(ids.nonEmpty) - -def storeBulk[A](index: Index, sources: List[(String, A)])(using Schema[A], Logger[IO])(using - elastic: ESClient[IO] -): IO[Unit] = - Logger[IO].info(s"Received ${sources.size} docs to ${index.value}") *> - elastic - .storeBulk(index, sources) - .handleErrorWith: e => - Logger[IO].error(e)(s"Failed to ${index.value} index: ${sources.map(_._1).mkString(", ")}") - .whenA(sources.nonEmpty) - *> Logger[IO].info(s"Indexed ${sources.size} ${index.value}s") - extension (s: String) def dollarPrefix = "$" + s -extension (store: KVStore) - def 
saveLastIndexedTimestamp(index: Index, time: Option[Instant])(using Logger[IO]): IO[Unit] = - val savedTime = time.getOrElse(Instant.now()) - store.put(index.value, savedTime) - *> Logger[IO].info(s"Stored last indexed time ${savedTime.getEpochSecond} for $index") +given [A: Schema]: Indexable[A] = (a: A) => writeToString(a) From 43d47d807854cbe20f9cd8025955eb3a27c4c640 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 21:06:55 +0100 Subject: [PATCH 16/18] Minor clean up + logs --- modules/ingestor/src/main/scala/Repo.scala | 9 ++---- .../ingestor/src/main/scala/mongo.forum.scala | 27 ++++++++--------- .../ingestor/src/main/scala/mongo.game.scala | 17 ++++++----- .../ingestor/src/main/scala/mongo.study.scala | 9 +++--- .../ingestor/src/main/scala/mongo.team.scala | 29 ++++++++++--------- 5 files changed, 45 insertions(+), 46 deletions(-) diff --git a/modules/ingestor/src/main/scala/Repo.scala b/modules/ingestor/src/main/scala/Repo.scala index 117e381c..75c26396 100644 --- a/modules/ingestor/src/main/scala/Repo.scala +++ b/modules/ingestor/src/main/scala/Repo.scala @@ -1,16 +1,13 @@ package lila.search package ingestor -import cats.effect.* -import cats.syntax.all.* +import cats.effect.IO import java.time.Instant -import Repo.Result - trait Repo[A]: - def watch(since: Option[Instant]): fs2.Stream[IO, Result[A]] - def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result[A]] + def watch(since: Option[Instant]): fs2.Stream[IO, Repo.Result[A]] + def fetch(since: Instant, until: Instant): fs2.Stream[IO, Repo.Result[A]] object Repo: type SourceWithId[A] = (String, A) diff --git a/modules/ingestor/src/main/scala/mongo.forum.scala b/modules/ingestor/src/main/scala/mongo.forum.scala index f18c0600..263e3dfe 100644 --- a/modules/ingestor/src/main/scala/mongo.forum.scala +++ b/modules/ingestor/src/main/scala/mongo.forum.scala @@ -52,19 +52,20 @@ object ForumRepo: val filter = range(F.createdAt)(since, until.some) .or(range(F.updatedAt)(since, until.some)) 
.or(range(F.erasedAt)(since, until.some)) - posts - .find(filter) - .projection(postProjection) - .boundedStream(config.batchSize) - .filter(_.validText) - .chunkN(config.batchSize) - .map(_.toList) - .metered(1.second) - .evalMap: events => - val (toDelete, toIndex) = events.partition(_.isErased) - toIndex.toSources - .map: sources => - Result(sources, toDelete.flatten(_.id.map(Id.apply)), none) + fs2.Stream.eval(info"Fetching forum posts from $since to $until") *> + posts + .find(filter) + .projection(postProjection) + .boundedStream(config.batchSize) + .filter(_.validText) + .chunkN(config.batchSize) + .map(_.toList) + .metered(1.second) + .evalMap: events => + val (toDelete, toIndex) = events.partition(_.isErased) + toIndex.toSources + .map: sources => + Result(sources, toDelete.flatten(_.id.map(Id.apply)), none) def watch(since: Option[Instant]): fs2.Stream[IO, Result[ForumSource]] = val builder = posts.watch(aggregate(config.maxPostLength)) diff --git a/modules/ingestor/src/main/scala/mongo.game.scala b/modules/ingestor/src/main/scala/mongo.game.scala index a0b5e3d4..88fe2585 100644 --- a/modules/ingestor/src/main/scala/mongo.game.scala +++ b/modules/ingestor/src/main/scala/mongo.game.scala @@ -86,14 +86,15 @@ object GameRepo: def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result[GameSource]] = val filter = range(F.createdAt)(since, until.some) .or(range(F.updatedAt)(since, until.some)) - games - .find(filter.and(gameFilter)) - // .projection(postProjection) - .boundedStream(config.batchSize) - .chunkN(config.batchSize) - .map(_.toList) - .metered(1.second) // to avoid overloading the elasticsearch - .map(ds => Result(ds.map(_.toSource), Nil, none)) + fs2.Stream.eval(info"Fetching games from $since to $until") *> + games + .find(filter.and(gameFilter)) + // .projection(postProjection) + .boundedStream(config.batchSize) + .chunkN(config.batchSize) + .map(_.toList) + .metered(1.second) // to avoid overloading the elasticsearch + .map(ds => 
Result(ds.map(_.toSource), Nil, none)) private def changes(since: Option[Instant]): fs2.Stream[IO, List[ChangeStreamDocument[DbGame]]] = val builder = games.watch(aggregate) diff --git a/modules/ingestor/src/main/scala/mongo.study.scala b/modules/ingestor/src/main/scala/mongo.study.scala index 7313fdd2..66d643fe 100644 --- a/modules/ingestor/src/main/scala/mongo.study.scala +++ b/modules/ingestor/src/main/scala/mongo.study.scala @@ -41,11 +41,10 @@ object StudyRepo: .flatMap(fetch) def fetch(since: Instant, until: Instant): fs2.Stream[IO, Result[StudySource]] = - // fs2.Stream.eval(info"Indexing studies from $since to $until") ++ - // fs2.Stream.eval(info"deleting studies from $since to $until") ++ - pullAndIndex(since, until) - .zip(pullAndDelete(since, until)) - .map((toIndex, toDelete) => Result(toIndex, toDelete, until.some)) + fs2.Stream.eval(info"Fetching studies from $since to $until") *> + pullAndIndex(since, until) + .zip(pullAndDelete(since, until)) + .map((toIndex, toDelete) => Result(toIndex, toDelete, until.some)) def pullAndIndex(since: Instant, until: Instant) = val filter = range(F.createdAt)(since, until.some) diff --git a/modules/ingestor/src/main/scala/mongo.team.scala b/modules/ingestor/src/main/scala/mongo.team.scala index 4565f62b..e7fad29a 100644 --- a/modules/ingestor/src/main/scala/mongo.team.scala +++ b/modules/ingestor/src/main/scala/mongo.team.scala @@ -66,20 +66,21 @@ object TeamRepo: val filter = range(F.createdAt)(since, until.some) .or(range(F.updatedAt)(since, until.some)) .or(range(F.erasedAt)(since, until.some)) - teams - .find(filter) - .projection(postProjection) - .boundedStream(config.batchSize) - .chunkN(config.batchSize) - .map(_.toList) - .metered(1.second) // to avoid overloading the elasticsearch - .map: docs => - val (toDelete, toIndex) = docs.partition(!_.isEnabled) - Result( - toIndex.toSources, - toDelete.flatten(_.id.map(Id.apply)), - none - ) + fs2.Stream.eval(info"Fetching teams from $since to $until") *> + teams + 
.find(filter) + .projection(postProjection) + .boundedStream(config.batchSize) + .chunkN(config.batchSize) + .map(_.toList) + .metered(1.second) // to avoid overloading the elasticsearch + .map: docs => + val (toDelete, toIndex) = docs.partition(!_.isEnabled) + Result( + toIndex.toSources, + toDelete.flatten(_.id.map(Id.apply)), + none + ) extension (docs: List[Document]) private def toSources: List[(String, TeamSource)] = From cd4a64850ac9865a09d9acc17ee6cfda43b2b386 Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 21:36:26 +0100 Subject: [PATCH 17/18] Use all instead of compile.stream.toList --- modules/ingestor/src/main/scala/mongo.chapter.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/ingestor/src/main/scala/mongo.chapter.scala b/modules/ingestor/src/main/scala/mongo.chapter.scala index 2d87eb2d..78bc37ea 100644 --- a/modules/ingestor/src/main/scala/mongo.chapter.scala +++ b/modules/ingestor/src/main/scala/mongo.chapter.scala @@ -108,8 +108,6 @@ object ChapterRepo: def byStudyIds(ids: List[String]): IO[Map[String, StudyData]] = coll .aggregateWithCodec[StudyData](Query.aggregate(ids)) - .stream - .compile - .toList + .all .flatTap(docs => Logger[IO].debug(s"Received $docs chapters")) .map(_.map(x => x._id -> x).toMap) From 905b2c509c8980586b4f1a77f9fd8f3130f2370e Mon Sep 17 00:00:00 2001 From: Thanh Le Date: Wed, 27 Nov 2024 22:21:52 +0100 Subject: [PATCH 18/18] Remove package file --- .../e2e/src/test/scala/IntegrationSuite.scala | 2 +- modules/ingestor/src/main/scala/Repo.scala | 31 +++++++++++++++ .../ingestor/src/main/scala/ingestor.scala | 5 +++ .../src/main/scala/mongo.chapter.scala | 2 + .../ingestor/src/main/scala/mongo.forum.scala | 2 +- .../ingestor/src/main/scala/mongo.game.scala | 2 +- .../ingestor/src/main/scala/mongo.study.scala | 2 +- .../ingestor/src/main/scala/mongo.team.scala | 2 +- modules/ingestor/src/main/scala/package.scala | 39 ------------------- 9 files changed, 43 insertions(+), 44 
deletions(-) delete mode 100644 modules/ingestor/src/main/scala/package.scala diff --git a/modules/e2e/src/test/scala/IntegrationSuite.scala b/modules/e2e/src/test/scala/IntegrationSuite.scala index 18a764da..717a2e80 100644 --- a/modules/e2e/src/test/scala/IntegrationSuite.scala +++ b/modules/e2e/src/test/scala/IntegrationSuite.scala @@ -5,7 +5,7 @@ package test import cats.effect.{ IO, Resource } import cats.syntax.all.* import com.comcast.ip4s.* -import lila.search.ingestor.given +import lila.search.ingestor.Ingestor.given import lila.search.spec.* import org.http4s.Uri import org.typelevel.log4cats.noop.{ NoOpFactory, NoOpLogger } diff --git a/modules/ingestor/src/main/scala/Repo.scala b/modules/ingestor/src/main/scala/Repo.scala index 75c26396..a111cc3d 100644 --- a/modules/ingestor/src/main/scala/Repo.scala +++ b/modules/ingestor/src/main/scala/Repo.scala @@ -12,3 +12,34 @@ trait Repo[A]: object Repo: type SourceWithId[A] = (String, A) case class Result[A](toIndex: List[SourceWithId[A]], toDelete: List[Id], timestamp: Option[Instant]) + + import cats.effect.IO + import mongo4cats.bson.Document + import mongo4cats.collection.GenericMongoCollection + import mongo4cats.models.collection.ChangeStreamDocument + import mongo4cats.operations.Filter + import org.bson.BsonTimestamp + + import java.time.Instant + + val _id = "_id" + + type MongoCollection = GenericMongoCollection[IO, Document, [A] =>> fs2.Stream[IO, A]] + + given [A]: HasDocId[ChangeStreamDocument[A]] with + extension (change: ChangeStreamDocument[A]) + def docId: Option[String] = + change.documentKey.flatMap(_.id) + + extension (doc: Document) + def id: Option[String] = + doc.getString(_id) + + extension (instant: Instant) + inline def asBsonTimestamp: BsonTimestamp = BsonTimestamp(instant.getEpochSecond.toInt, 1) + + def range(field: String)(since: Instant, until: Option[Instant]): Filter = + inline def gtes = Filter.gte(field, since) + until.fold(gtes)(until => gtes.and(Filter.lt(field, until))) + + 
extension (s: String) def dollarPrefix = "$" + s diff --git a/modules/ingestor/src/main/scala/ingestor.scala b/modules/ingestor/src/main/scala/ingestor.scala index 0b32aecf..27fbd8f5 100644 --- a/modules/ingestor/src/main/scala/ingestor.scala +++ b/modules/ingestor/src/main/scala/ingestor.scala @@ -3,8 +3,11 @@ package ingestor import cats.effect.* import cats.syntax.all.* +import com.github.plokhotnyuk.jsoniter_scala.core.* +import com.sksamuel.elastic4s.Indexable import org.typelevel.log4cats.syntax.* import org.typelevel.log4cats.{ Logger, LoggerFactory } +import smithy4s.json.Json.given import smithy4s.schema.Schema import java.time.Instant @@ -19,6 +22,8 @@ trait Ingestor: object Ingestor: + given [A: Schema]: Indexable[A] = (a: A) => writeToString(a) + def apply[A: Schema]( index: Index, repo: Repo[A], diff --git a/modules/ingestor/src/main/scala/mongo.chapter.scala b/modules/ingestor/src/main/scala/mongo.chapter.scala index 78bc37ea..85d265b6 100644 --- a/modules/ingestor/src/main/scala/mongo.chapter.scala +++ b/modules/ingestor/src/main/scala/mongo.chapter.scala @@ -11,6 +11,8 @@ import mongo4cats.database.MongoDatabase import mongo4cats.operations.{ Accumulator, Aggregate, Filter } import org.typelevel.log4cats.Logger +import Repo.* + trait ChapterRepo: // Aggregate chapters data and convert them to StudyChapterText by their study ids def byStudyIds(ids: List[String]): IO[Map[String, StudyData]] diff --git a/modules/ingestor/src/main/scala/mongo.forum.scala b/modules/ingestor/src/main/scala/mongo.forum.scala index 263e3dfe..eb867d7a 100644 --- a/modules/ingestor/src/main/scala/mongo.forum.scala +++ b/modules/ingestor/src/main/scala/mongo.forum.scala @@ -15,7 +15,7 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant import scala.concurrent.duration.* -import Repo.{ Result, SourceWithId } +import Repo.{ *, given } object ForumRepo: diff --git a/modules/ingestor/src/main/scala/mongo.game.scala 
b/modules/ingestor/src/main/scala/mongo.game.scala index 88fe2585..2892af57 100644 --- a/modules/ingestor/src/main/scala/mongo.game.scala +++ b/modules/ingestor/src/main/scala/mongo.game.scala @@ -20,7 +20,7 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant import scala.concurrent.duration.* -import Repo.Result +import Repo.{ *, given } object GameRepo: diff --git a/modules/ingestor/src/main/scala/mongo.study.scala b/modules/ingestor/src/main/scala/mongo.study.scala index 66d643fe..abeb3e04 100644 --- a/modules/ingestor/src/main/scala/mongo.study.scala +++ b/modules/ingestor/src/main/scala/mongo.study.scala @@ -11,7 +11,7 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant -import Repo.Result +import Repo.* object StudyRepo: diff --git a/modules/ingestor/src/main/scala/mongo.team.scala b/modules/ingestor/src/main/scala/mongo.team.scala index e7fad29a..5290c31f 100644 --- a/modules/ingestor/src/main/scala/mongo.team.scala +++ b/modules/ingestor/src/main/scala/mongo.team.scala @@ -15,7 +15,7 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory } import java.time.Instant import scala.concurrent.duration.* -import Repo.Result +import Repo.{ *, given } object TeamRepo: diff --git a/modules/ingestor/src/main/scala/package.scala b/modules/ingestor/src/main/scala/package.scala deleted file mode 100644 index 4bc2a250..00000000 --- a/modules/ingestor/src/main/scala/package.scala +++ /dev/null @@ -1,39 +0,0 @@ -package lila.search -package ingestor - -import cats.effect.IO -import com.github.plokhotnyuk.jsoniter_scala.core.* -import com.sksamuel.elastic4s.Indexable -import mongo4cats.bson.Document -import mongo4cats.collection.GenericMongoCollection -import mongo4cats.models.collection.ChangeStreamDocument -import mongo4cats.operations.Filter -import org.bson.BsonTimestamp -import smithy4s.json.Json.given -import smithy4s.schema.Schema - -import java.time.Instant - -val _id = "_id" - -type MongoCollection = 
GenericMongoCollection[IO, Document, [A] =>> fs2.Stream[IO, A]] - -given [A]: HasDocId[ChangeStreamDocument[A]] with - extension (change: ChangeStreamDocument[A]) - def docId: Option[String] = - change.documentKey.flatMap(_.id) - -extension (doc: Document) - private def id: Option[String] = - doc.getString(_id) - -extension (instant: Instant) - inline def asBsonTimestamp: BsonTimestamp = BsonTimestamp(instant.getEpochSecond.toInt, 1) - -def range(field: String)(since: Instant, until: Option[Instant]): Filter = - inline def gtes = Filter.gte(field, since) - until.fold(gtes)(until => gtes.and(Filter.lt(field, until))) - -extension (s: String) def dollarPrefix = "$" + s - -given [A: Schema]: Indexable[A] = (a: A) => writeToString(a)