Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add options to scrap #4

Open
github-actions bot opened this issue Sep 8, 2023 · 0 comments
Open

add options to scrap #4

github-actions bot opened this issue Sep 8, 2023 · 0 comments
Labels

Comments

@github-actions
Copy link

github-actions bot commented Sep 8, 2023

  • for all event sources

  • for all event sources of a specific type

  • for a specific event source by its uri

const eventSources = await db.eventSources.findAll();

const scrappers = new Map<EventSourceType, Scrapper>([

https://api.github.com/vorant94/sofash/blob/fcb3ff714b19baf963f745027eebdd11e63af32c/apps/cli/src/scrap/index.ts#L11

import { Command } from 'commander';
import { type Db, type EventSourceType } from 'db';
import { type Client } from 'tdl';
import { TelegramScrapper } from './scrappers/telegram.scrapper.js';
import { type Mq, type RawEventJob } from 'mq';
import { type Scrapper } from './scrappers/scrapper.js';
import { CONTAINER, DB, LOGGER, MQ, TELEGRAM } from '../shared/container.js';
import { type Logger } from 'logger';
import { MeetupScrapper } from './scrappers/meetup.scrapper.js';

/**
 * Turns a rejection reason into a loggable string.
 *
 * `Error` instances are reduced to their message; anything else is
 * JSON-serialized. Serialization is guarded because `JSON.stringify` throws on
 * circular structures and `bigint` values and returns `undefined` for values
 * such as `undefined` or functions - in those cases `String(reason)` is used.
 *
 * @param reason - the value a scrap promise was rejected with
 * @returns human-readable description of the reason
 */
function describeFailureReason(reason: unknown): string {
  if (reason instanceof Error) {
    return reason.message;
  }

  try {
    const serialized: string | undefined = JSON.stringify(reason, null, 2);
    return serialized ?? String(reason);
  } catch {
    return String(reason);
  }
}

// TODO add options to scrap
//  - for all event sources
//  - for all event sources of a specific type
//  - for a specific event source by its uri
/**
 * `scrap` CLI command.
 *
 * Loads every event source from the database, processes all of them in
 * parallel with the scrapper registered for their type, queues the scrapped
 * contents as raw event jobs, stores the latest scrapped message id and
 * finally logs how many event sources succeeded and why the others failed.
 * A failure of one event source does not interrupt the others.
 */
export const SCRAP_COMMAND = new Command('scrap').action(async () => {
  const db = CONTAINER.get<Db>(DB);
  const telegram = CONTAINER.get<Client>(TELEGRAM);
  const mq = CONTAINER.get<Mq>(MQ);
  const logger = CONTAINER.get<Logger>(LOGGER).clone('ScrapCommand');

  //
  // collect event sources
  //
  const eventSources = await db.eventSources.findAll();
  logger.info(`collected [${eventSources.length}] event sources to scrap`);

  //
  // configure scrappers
  //
  const scrappers = new Map<EventSourceType, Scrapper>([
    // TODO fix proper generic types here to avoid manual type assertion
    ['telegram', new TelegramScrapper(telegram, logger) as Scrapper],
    ['meetup', new MeetupScrapper(logger) as Scrapper],
  ]);

  //
  // process all event sources in parallel
  //
  const results = await Promise.allSettled(
    eventSources.map(async (eventSource) => {
      const scrapper = scrappers.get(eventSource.type);
      if (scrapper == null) {
        throw new Error(
          `No scrapper is configured for event source type [${eventSource.type}]`,
        );
      }

      //
      // scrap event source
      //
      const contents = await scrapper.scrapEventSource(eventSource);
      if (contents.length === 0) {
        // nothing new to queue, so the latest scrapped message id stays as is
        return;
      }

      //
      // queue raw event jobs
      //
      const jobs: RawEventJob[] = contents.map((content) =>
        scrapper.createRawEventJob(eventSource, content),
      );
      await mq.rawEvents.queueJobsBulk(jobs);
      logger.info(
        `successfully queued [${jobs.length}] raw event jobs for event source [${eventSource.uri}]`,
        { uri: eventSource.uri },
      );

      //
      // update latest scrapped message id
      //
      // NOTE(review): uses the first content item as the latest one -
      // presumably scrappers return contents newest-first; confirm.
      await db.eventSources.updateLatestScrappedMessageId(
        eventSource.id,
        scrapper.getScrappedMessageId(contents[0]),
      );
    }),
  );

  //
  // collect failures
  //
  // Promise.allSettled yields exactly one result per input, in input order, so
  // results can be matched to event sources by index. Failures are kept in a
  // list (not a map keyed by uri) so that two failing event sources sharing a
  // uri are both counted and both logged.
  const failures: Array<{ uri: string; reason: unknown }> = [];
  for (const [index, eventSource] of eventSources.entries()) {
    const result = results[index];
    if (result == null || result.status !== 'rejected') {
      continue;
    }

    failures.push({ uri: eventSource.uri, reason: result.reason });
  }

  //
  // log results
  //
  const successCount = eventSources.length - failures.length;
  logger.info(`successfully scrapped [${successCount}] event sources`);

  if (failures.length === 0) {
    logger.info(`no event source failed to scrap`);
    return;
  }

  for (const { uri, reason } of failures) {
    const message = describeFailureReason(reason);

    logger.error(`event source [${uri}] finished with errors [${message}]`, {
      uri,
    });
  }
});
@github-actions github-actions bot added the todo label Sep 8, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

0 participants