Skip to content

Commit

Permalink
Merge pull request #375 from lichess-org/ingestor-refactor/0
Browse files Browse the repository at this point in the history
Refactor ingestor module
  • Loading branch information
lenguyenthanh authored Nov 29, 2024
2 parents 247064a + 905b2c5 commit d5ad73e
Show file tree
Hide file tree
Showing 15 changed files with 344 additions and 400 deletions.
4 changes: 2 additions & 2 deletions modules/app/src/main/scala/app.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package app

import cats.effect.*
import cats.syntax.all.*
import org.typelevel.log4cats.slf4j.{ Slf4jFactory, Slf4jLogger }
import org.typelevel.log4cats.slf4j.Slf4jFactory
import org.typelevel.log4cats.{ Logger, LoggerFactory }
import org.typelevel.otel4s.experimental.metrics.*
import org.typelevel.otel4s.metrics.Meter
Expand All @@ -14,8 +14,8 @@ import org.typelevel.otel4s.sdk.metrics.exporter.MetricExporter

object App extends IOApp.Simple:

given Logger[IO] = Slf4jLogger.getLogger[IO]
given LoggerFactory[IO] = Slf4jFactory.create[IO]
given Logger[IO] = LoggerFactory[IO].getLogger

override def run: IO[Unit] = app.useForever

Expand Down
2 changes: 1 addition & 1 deletion modules/app/src/main/scala/service.health.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import org.typelevel.log4cats.{ Logger, LoggerFactory }

class HealthServiceImpl(esClient: ESClient[IO])(using LoggerFactory[IO]) extends HealthService[IO]:

given logger: Logger[IO] = summon[LoggerFactory[IO]].getLogger
given logger: Logger[IO] = LoggerFactory[IO].getLogger

override def healthCheck(): IO[HealthCheckOutput] =
esClient.status
Expand Down
9 changes: 1 addition & 8 deletions modules/app/src/main/scala/service.search.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package lila.search
package app

import cats.effect.*
import com.sksamuel.elastic4s.Indexable
import io.github.arainko.ducktape.*
import lila.search.forum.Forum
import lila.search.game.Game
Expand All @@ -11,15 +10,14 @@ import lila.search.study.Study
import lila.search.team.Team
import org.typelevel.log4cats.{ Logger, LoggerFactory }
import smithy4s.Timestamp
import smithy4s.schema.Schema

import java.time.Instant

class SearchServiceImpl(esClient: ESClient[IO])(using LoggerFactory[IO]) extends SearchService[IO]:

import SearchServiceImpl.given

given logger: Logger[IO] = summon[LoggerFactory[IO]].getLogger
given logger: Logger[IO] = LoggerFactory[IO].getLogger

override def count(query: Query): IO[CountOutput] =
esClient
Expand Down Expand Up @@ -68,8 +66,3 @@ object SearchServiceImpl:
case _: Query.Game => Index.Game
case _: Query.Study => Index.Study
case _: Query.Team => Index.Team

import smithy4s.json.Json.given
import com.github.plokhotnyuk.jsoniter_scala.core.*

given [A: Schema]: Indexable[A] = (a: A) => writeToString(a)
4 changes: 2 additions & 2 deletions modules/e2e/src/test/scala/IntegrationSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ package test
import cats.effect.{ IO, Resource }
import cats.syntax.all.*
import com.comcast.ip4s.*
import lila.search.ingestor.given
import lila.search.ingestor.Ingestor.given
import lila.search.spec.*
import org.http4s.Uri
import org.typelevel.log4cats.noop.{ NoOpFactory, NoOpLogger }
Expand Down Expand Up @@ -41,7 +41,7 @@ object IntegrationSuite extends IOSuite:

def testAppConfig(elastic: ElasticConfig) = AppConfig(
server =
HttpServerConfig(ip"0.0.0.0", port"9999", apiLogger = false, shutdownTimeout = 30, enableDocs = false),
HttpServerConfig(ip"0.0.0.0", port"9999", apiLogger = false, shutdownTimeout = 1, enableDocs = false),
elastic = elastic
)

Expand Down
45 changes: 45 additions & 0 deletions modules/ingestor/src/main/scala/Repo.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package lila.search
package ingestor

import cats.effect.IO

import java.time.Instant

/** A source of `A` documents that can be ingested, exposed as streams of [[Repo.Result]] batches. */
trait Repo[A]:
// Streams result batches, optionally starting from `since` (None leaves the starting point to the implementation).
def watch(since: Option[Instant]): fs2.Stream[IO, Repo.Result[A]]
// Streams result batches for the time window delimited by `since` and `until`.
// NOTE(review): bound inclusivity is decided by the implementation — confirm against concrete repos.
def fetch(since: Instant, until: Instant): fs2.Stream[IO, Repo.Result[A]]

object Repo:
type SourceWithId[A] = (String, A)
// One batch produced by a Repo:
//  - toIndex: (String, A) pairs to be stored; the String is presumably the document id — confirm
//  - toDelete: ids of documents to remove
//  - timestamp: optional instant associated with the batch
case class Result[A](toIndex: List[SourceWithId[A]], toDelete: List[Id], timestamp: Option[Instant])

import cats.effect.IO
import mongo4cats.bson.Document
import mongo4cats.collection.GenericMongoCollection
import mongo4cats.models.collection.ChangeStreamDocument
import mongo4cats.operations.Filter
import org.bson.BsonTimestamp

import java.time.Instant

val _id = "_id"

type MongoCollection = GenericMongoCollection[IO, Document, [A] =>> fs2.Stream[IO, A]]

// HasDocId instance for change-stream events: the id is taken from the event's
// `documentKey` (when present) through the `id` accessor; None otherwise.
given [A]: HasDocId[ChangeStreamDocument[A]] with
extension (change: ChangeStreamDocument[A])
def docId: Option[String] =
change.documentKey.flatMap(_.id)

// Reads the `_id` field of a Document as a String, if available.
extension (doc: Document)
def id: Option[String] =
doc.getString(_id)

// Converts an Instant to a BsonTimestamp built from its epoch seconds and a fixed second argument of 1.
// NOTE(review): `.toInt` narrows the Long epoch seconds without a range check, so instants whose
// epoch seconds exceed Int.MaxValue would wrap silently — confirm this is acceptable.
extension (instant: Instant)
inline def asBsonTimestamp: BsonTimestamp = BsonTimestamp(instant.getEpochSecond.toInt, 1)

/** Builds a filter selecting values of `field` that are `>= since` and, when `until` is
  * defined, also `< until`.
  */
def range(field: String)(since: Instant, until: Option[Instant]): Filter =
  val lowerBound = Filter.gte(field, since)
  until match
    case None        => lowerBound
    case Some(upper) => lowerBound.and(Filter.lt(field, upper))

// Prepends a `$` character to the string.
extension (s: String) def dollarPrefix = "$" + s
6 changes: 3 additions & 3 deletions modules/ingestor/src/main/scala/app.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package lila.search
package ingestor

import cats.effect.*
import org.typelevel.log4cats.slf4j.{ Slf4jFactory, Slf4jLogger }
import org.typelevel.log4cats.slf4j.Slf4jFactory
import org.typelevel.log4cats.{ Logger, LoggerFactory }
import org.typelevel.otel4s.experimental.metrics.*
import org.typelevel.otel4s.metrics.Meter
Expand All @@ -11,8 +11,8 @@ import org.typelevel.otel4s.sdk.metrics.SdkMetrics

object App extends IOApp.Simple:

given Logger[IO] = Slf4jLogger.getLogger[IO]
given LoggerFactory[IO] = Slf4jFactory.create[IO]
given Logger[IO] = LoggerFactory[IO].getLogger

override def run: IO[Unit] = app.useForever

Expand All @@ -33,7 +33,7 @@ object App extends IOApp.Simple:

class IngestorApp(res: AppResources, config: AppConfig)(using Logger[IO], LoggerFactory[IO]):
def run(): Resource[IO, Unit] =
Ingestor(res.lichess, res.study, res.studyLocal, res.elastic, res.store, config.ingestor)
Ingestors(res.lichess, res.study, res.studyLocal, res.store, res.elastic, config.ingestor)
.flatMap(_.run())
.toResource
.evalTap(_ => Logger[IO].info("Ingestor started"))
68 changes: 33 additions & 35 deletions modules/ingestor/src/main/scala/cli.scala
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import cats.syntax.all.*
import com.monovore.decline.*
import com.monovore.decline.effect.*
import lila.search.ingestor.opts.{ IndexOpts, WatchOpts }
import org.typelevel.log4cats.slf4j.{ Slf4jFactory, Slf4jLogger }
import org.typelevel.log4cats.slf4j.Slf4jFactory
import org.typelevel.log4cats.{ Logger, LoggerFactory }
import org.typelevel.otel4s.metrics.Meter

Expand All @@ -20,36 +20,29 @@ object cli
version = "3.0.0"
):

given Logger[IO] = Slf4jLogger.getLogger[IO]
given LoggerFactory[IO] = Slf4jFactory.create[IO]
given Logger[IO] = LoggerFactory[IO].getLogger
given Meter[IO] = Meter.noop[IO]

override def main: Opts[IO[ExitCode]] =
opts.parse.map: opts =>
makeExecutor.use(_.execute(opts).as(ExitCode.Success))
makeIngestor.use(_.execute(opts).as(ExitCode.Success))

def makeExecutor: Resource[IO, Executor] =
def makeIngestor: Resource[IO, Ingestors] =
for
config <- AppConfig.load.toResource
res <- AppResources.instance(config)
forum <- ForumIngestor(res.lichess, res.elastic, res.store, config.ingestor.forum).toResource
team <- TeamIngestor(res.lichess, res.elastic, res.store, config.ingestor.team).toResource
study <- StudyIngestor(
ingestor <- Ingestors(
res.lichess,
res.study,
res.studyLocal,
res.elastic,
res.store,
config.ingestor.study
res.elastic,
config.ingestor
).toResource
game <- GameIngestor(res.lichess, res.elastic, res.store, config.ingestor.game).toResource
yield Executor(forum, study, game, team)

class Executor(
val forum: ForumIngestor,
val study: StudyIngestor,
val game: GameIngestor,
val team: TeamIngestor
):
yield ingestor

extension (ingestor: Ingestors)
def execute(opts: IndexOpts | WatchOpts): IO[Unit] =
opts match
case opts: IndexOpts => index(opts)
Expand All @@ -58,28 +51,38 @@ object cli
def index(opts: IndexOpts): IO[Unit] =
opts.index match
case Index.Forum =>
forum.run(opts.since, opts.until, opts.dry).compile.drain
ingestor.forum.run(opts.since, opts.until, opts.dry)
case Index.Study =>
study.run(opts.since, opts.until, opts.dry).compile.drain
ingestor.study.run(opts.since, opts.until, opts.dry)
case Index.Game =>
game.run(opts.since, opts.until, opts.dry).compile.drain
ingestor.game.run(opts.since, opts.until, opts.dry)
case Index.Team =>
team.run(opts.since, opts.until, opts.dry).compile.drain
ingestor.team.run(opts.since, opts.until, opts.dry)
case _ =>
forum.run(opts.since, opts.until, opts.dry).compile.drain *>
study.run(opts.since, opts.until, opts.dry).compile.drain *>
game.run(opts.since, opts.until, opts.dry).compile.drain *>
team.run(opts.since, opts.until, opts.dry).compile.drain
ingestor.forum.run(opts.since, opts.until, opts.dry) *>
ingestor.study.run(opts.since, opts.until, opts.dry) *>
ingestor.game.run(opts.since, opts.until, opts.dry) *>
ingestor.team.run(opts.since, opts.until, opts.dry)

def watch(opts: WatchOpts): IO[Unit] =
opts.index match
case Index.Game =>
game.watch(opts.since.some, opts.dry).compile.drain
case _ => IO.println("We only support game watch for now")
ingestor.game.watch(opts.since.some, opts.dry)
case Index.Forum =>
ingestor.forum.watch(opts.since.some, opts.dry)
case Index.Team =>
ingestor.team.watch(opts.since.some, opts.dry)
case Index.Study =>
ingestor.study.watch(opts.since.some, opts.dry)
case _ =>
ingestor.forum.watch(opts.since.some, opts.dry) *>
ingestor.team.watch(opts.since.some, opts.dry) *>
ingestor.study.watch(opts.since.some, opts.dry) *>
ingestor.game.watch(opts.since.some, opts.dry)

object opts:
case class IndexOpts(index: Index | Unit, since: Instant, until: Instant, dry: Boolean)
case class WatchOpts(index: Index, since: Instant, dry: Boolean)
case class WatchOpts(index: Index | Unit, since: Instant, dry: Boolean)

def parse = Opts.subcommand("index", "index documents")(indexOpt) <+>
Opts.subcommand("watch", "watch change events and index documents")(watchOpt)
Expand Down Expand Up @@ -128,12 +131,7 @@ object opts:
)

val watchOpt = (
Opts.option[Index](
long = "index",
help = "Target index (only `game` for now)",
short = "i",
metavar = "forum|team|study|game"
),
singleIndexOpt orElse allIndexOpt,
Opts
.option[Instant](
long = "since",
Expand Down
106 changes: 82 additions & 24 deletions modules/ingestor/src/main/scala/ingestor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,91 @@ package ingestor

import cats.effect.*
import cats.syntax.all.*
import mongo4cats.database.MongoDatabase
import org.typelevel.log4cats.LoggerFactory
import com.github.plokhotnyuk.jsoniter_scala.core.*
import com.sksamuel.elastic4s.Indexable
import org.typelevel.log4cats.syntax.*
import org.typelevel.log4cats.{ Logger, LoggerFactory }
import smithy4s.json.Json.given
import smithy4s.schema.Schema

import java.time.Instant

trait Ingestor:
def run(): IO[Unit]
// watch change events from database and ingest documents into elastic search
def watch: IO[Unit]
// Similar to watch but started from a given timestamp
def watch(since: Option[Instant], dryRun: Boolean): IO[Unit]
// Fetch documents in [since, until] and ingest into elastic search
def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit]

object Ingestor:

def apply(
lichess: MongoDatabase[IO],
study: MongoDatabase[IO],
local: MongoDatabase[IO],
elastic: ESClient[IO],
given [A: Schema]: Indexable[A] = (a: A) => writeToString(a)

def apply[A: Schema](
index: Index,
repo: Repo[A],
store: KVStore,
config: IngestorConfig
)(using LoggerFactory[IO]): IO[Ingestor] =
(
ForumIngestor(lichess, elastic, store, config.forum),
TeamIngestor(lichess, elastic, store, config.team),
StudyIngestor(study, local, elastic, store, config.study),
GameIngestor(lichess, elastic, store, config.game)
).mapN: (forum, team, study, game) =>
new Ingestor:
def run() =
fs2
.Stream(forum.watch, team.watch, study.watch, game.watch)
.covary[IO]
.parJoinUnbounded
.compile
.drain
elastic: ESClient[IO],
defaultStartAt: Option[Instant]
)(using LoggerFactory[IO]): Ingestor = new:
given Logger[IO] = LoggerFactory[IO].getLogger

/** Resolves the starting point via `startAt`, then consumes `repo.watch` from there: every
  * batch is applied to Elasticsearch (never as a dry run) and its timestamp is then saved as
  * the last indexed time for `index`.
  */
def watch: IO[Unit] =
  val ingestion =
    for
      since <- fs2.Stream.eval(startAt)
      batch <- repo.watch(since)
      _ <- fs2.Stream.eval(
        updateElastic(batch, dryRun = false) *> saveLastIndexedTimestamp(index, batch.timestamp)
      )
    yield ()
  ingestion.compile.drain

/** Consumes `repo.watch(since)` and applies every batch through `updateElastic`, honouring
  * `dryRun`. Unlike the parameterless `watch`, no last-indexed timestamp is saved.
  */
def watch(since: Option[Instant], dryRun: Boolean): IO[Unit] =
  val changes = repo.watch(since)
  changes
    .evalMap(batch => updateElastic(batch, dryRun))
    .compile
    .drain

/** Consumes `repo.fetch(since, until)` and applies every batch through `updateElastic`,
  * honouring `dryRun`.
  */
def run(since: Instant, until: Instant, dryRun: Boolean): IO[Unit] =
  val batches = repo.fetch(since, until)
  batches
    .evalMap(batch => updateElastic(batch, dryRun))
    .compile
    .drain

/** Applies one batch to Elasticsearch, or only logs what would happen when `dryRun` is set.
  *
  * In dry-run mode nothing is written: a summary is logged at info level and each document /
  * id at debug level. Otherwise documents are bulk-stored first, then deletions are applied.
  *
  * The summary names the actual index (`index.value`) instead of the previously hard-coded
  * "games", since this ingestor is shared by every index.
  */
private def updateElastic(result: Repo.Result[A], dryRun: Boolean): IO[Unit] =
  if dryRun then
    info"Would index total ${result.toIndex.size} ${index.value}s and delete ${result.toDelete.size} ${index.value}s" *>
      result.toIndex.traverse_(x => debug"Would index $x") *>
      result.toDelete.traverse_(x => debug"Would delete $x")
  else
    storeBulk(index, result.toIndex) *>
      deleteMany(index, result.toDelete)

/** Determines where watching should start: the configured `defaultStartAt` when defined,
  * otherwise whatever the store holds for this index. The chosen value is logged.
  */
private def startAt: IO[Option[Instant]] =
  val resolved: IO[Option[Instant]] = defaultStartAt match
    case None       => store.get(index.value)
    case configured => IO.pure(configured)
  resolved.flatTap(since => info"Starting ${index.value} ingestor from $since")

/** Deletes `ids` from `index`, logging the count on success. A failure is logged together
  * with the affected ids and then swallowed. Nothing is done when `ids` is empty.
  */
private def deleteMany(index: Index, ids: List[Id]): IO[Unit] =
  val deleteAndLog =
    elastic.deleteMany(index, ids) <* Logger[IO].info(s"Deleted ${ids.size} ${index.value}s")
  val recovered = deleteAndLog.handleErrorWith(err =>
    Logger[IO].error(err)(s"Failed to delete ${index.value}: ${ids.map(_.value).mkString(", ")}")
  )
  recovered.whenA(ids.nonEmpty)

/** Bulk-stores `sources` into `index`.
  *
  * The number of received documents is always logged. The bulk request is only issued when
  * `sources` is non-empty. "Indexed ..." is logged only after the request succeeded, so a
  * failed or skipped request no longer reports documents as indexed; this matches how
  * `deleteMany` reports success. A failure is logged together with the affected ids and
  * then swallowed.
  */
private def storeBulk(index: Index, sources: List[(String, A)]): IO[Unit] =
  Logger[IO].info(s"Received ${sources.size} docs to ${index.value}") *>
    elastic
      .storeBulk(index, sources)
      // success-only: placed before the error handler so it is skipped on failure
      .flatTap(_ => Logger[IO].info(s"Indexed ${sources.size} ${index.value}s"))
      .handleErrorWith(e =>
        Logger[IO].error(e)(s"Failed to ${index.value} index: ${sources.map(_._1).mkString(", ")}")
      )
      .whenA(sources.nonEmpty)

/** Persists the last indexed time for `index` in the store and logs it.
  *
  * When `time` is empty the current time is used. It is read through `IO.realTimeInstant`,
  * so the clock is sampled when the effect runs rather than when this method is called,
  * keeping the method free of side effects outside `IO`.
  */
private def saveLastIndexedTimestamp(index: Index, time: Option[Instant]): IO[Unit] =
  time
    .fold(IO.realTimeInstant)(instant => IO.pure(instant))
    .flatMap: savedTime =>
      store.put(index.value, savedTime) *>
        Logger[IO].info(s"Stored last indexed time ${savedTime.getEpochSecond} for $index")
Loading

0 comments on commit d5ad73e

Please sign in to comment.