fix: change primary db cache to mongo
rayriffy committed Feb 14, 2025
1 parent 28a8594 commit 6243b59
Showing 7 changed files with 89 additions and 89 deletions.
3 changes: 1 addition & 2 deletions scraper/package.json
@@ -17,9 +17,8 @@
"dist"
],
"dependencies": {
"kysely": "0.27.4",
"mongodb": "6.13.0",
"p-queue": "8.0.1",
"pg": "8.13.1",
"puppeteer": "24.1.1",
"puppeteer-extra": "3.3.6",
"puppeteer-extra-plugin-stealth": "2.11.2",
43 changes: 34 additions & 9 deletions scraper/src/commands/fetch.ts
@@ -7,10 +7,19 @@ import { chunk } from '../constants/chunk'
import { cacheDirectory } from '../constants/cacheDirectory'
import { hentaiDirectory } from '../constants/hentaiDirectory'
import { parseUrl } from "../functions/parseUrl";
import {getGalleriesViaBrowser} from "../functions/getGalleriesViaBrowser";
import {getGalleriesViaFetch} from "../functions/getGalleriesViaFetch";
import { getGalleriesViaBrowser } from "../functions/getGalleriesViaBrowser";
import { getGalleriesViaFetch } from "../functions/getGalleriesViaFetch";
import { collections } from "../constants/mongo";
import { writeItem } from "../functions/writeItem";

export const fetch = async (entryPoint: string, browserMode: boolean) => {
if (process.env.MONGODB_URL === undefined) {
console.error(
'no database url provided, please provide a MongoDB connection url'
)
return process.exit(1)
}

const { default: codes } = (await import(
path.join(process.cwd(), entryPoint)
)) as {
@@ -51,22 +60,37 @@ export const fetch = async (entryPoint: string, browserMode: boolean) => {
if (!fs.existsSync(hentaiDirectory))
await fs.promises.mkdir(hentaiDirectory, { recursive: true })

// let hasError
const idsNeedsToBeFetched = codes
.map(code => typeof code === 'object' ? code.code : code)
.map(code => Number(code))
.filter(code => {
return !fs.existsSync(path.join(hentaiDirectory, `${code}.json`))
})

console.log(`${idsNeedsToBeFetched.length} galleries needs to be fetched`)

const fetchResult = await (browserMode ? getGalleriesViaBrowser : getGalleriesViaFetch)(idsNeedsToBeFetched)
if (idsNeedsToBeFetched.length > 0) {
// look for hot-cache first
const mongoItems = await collections.galleries.find({
id: { $in: idsNeedsToBeFetched }
}, {
projection: { _id: 0 }
}).toArray()

await Promise.all(mongoItems.map(item =>
writeItem(item.id, item)
))

if (fetchResult.failure > 0) {
console.error("there's some error during fetching! crashing...")
process.exit(1)
} else {
console.log('fetched all galleries')
console.log(`${mongoItems.length} found in cache! ${idsNeedsToBeFetched.length - mongoItems.length} needs to be fetched further`)

const fetchResult = await (browserMode ? getGalleriesViaBrowser : getGalleriesViaFetch)(idsNeedsToBeFetched)

if (fetchResult.failure > 0) {
console.error("there's some error during fetching! crashing...")
process.exit(1)
} else {
console.log('fetched all galleries')
}
}

/**
@@ -102,5 +126,6 @@ export const fetch = async (entryPoint: string, browserMode: boolean) => {
path.join(cacheDirectory, 'searchKey.json'),
JSON.stringify(orderedHentai)
)

console.log('completed!')
}
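
The hot-cache path above relies on writeItem, which this commit imports but does not modify, so the helper itself never appears in the diff. Since fetch.ts skips any id that already has a matching JSON file in hentaiDirectory, a plausible shape for the helper — offered purely as an assumption for readers following the diff, not as code from the repository — is:

// Hypothetical sketch of writeItem (the real implementation is not shown in this
// commit). It persists one cached gallery into hentaiDirectory as `${id}.json`,
// the same location fetch.ts checks when deciding which ids still need scraping.
import fs from 'fs'
import path from 'path'

import { hentaiDirectory } from '../constants/hentaiDirectory'

export const writeItem = async (id: number, item: unknown) =>
  fs.promises.writeFile(
    path.join(hentaiDirectory, `${id}.json`),
    JSON.stringify(item)
  )
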
18 changes: 0 additions & 18 deletions scraper/src/commands/seed.ts

This file was deleted.

81 changes: 34 additions & 47 deletions scraper/src/commands/sync.ts
@@ -4,61 +4,48 @@ import path from 'path'
import { Hentai } from '@riffyh/commons'

import { hentaiDirectory } from '../constants/hentaiDirectory'
import { createDBConnection } from '../functions/createDBConnection'
import { jsonb } from '../functions/jsonb'
import { collections } from "../constants/mongo";

export const sync = async () => {
if (process.env.DATABASE_URL === undefined) {
if (process.env.MONGODB_URL === undefined) {
console.error(
'no database url provided, please provide a MongoDB connection url'
)
process.exit(1)
} else {
const db = createDBConnection()
return process.exit(1)
}

try {
const remoteIds = (await collections.galleries.find({}, {
projection: { _id: 0, id: 1 },
}).toArray()).map(o => o.id)

try {
// list ids in local cache
const ids = await db
.selectFrom('Hentai')
.select('id')
.execute()
.then(o => o.map(p => p.id))
// push new data to local cache
const itemsToPush = fs
.readdirSync(hentaiDirectory)
.filter(o => !o.startsWith('.') && o.endsWith('.json'))
.filter(
o =>
!remoteIds.includes(
(
JSON.parse(
fs.readFileSync(path.join(hentaiDirectory, o), 'utf8')
) as Hentai
).id
)
)
.map(file => {
return JSON.parse(
fs.readFileSync(path.join(hentaiDirectory, file), 'utf8')
) as Hentai
})

// push new data to local cache
const itemsToPush = fs
.readdirSync(hentaiDirectory)
.filter(o => !o.startsWith('.') && o.endsWith('.json'))
.filter(
o =>
!ids.includes(
(
JSON.parse(
fs.readFileSync(path.join(hentaiDirectory, o), 'utf8')
) as Hentai
).id
)
)
.map(file => {
return JSON.parse(
fs.readFileSync(path.join(hentaiDirectory, file), 'utf8')
) as Hentai
})
console.log(itemsToPush.length + ' items to push')

console.log(itemsToPush.length + ' items to push')
if (itemsToPush.length > 0)
await collections.galleries.insertMany(itemsToPush)

await db
.insertInto('Hentai')
.values(
itemsToPush.map(item => ({
id: item.id,
data: jsonb(JSON.stringify(item)),
}))
)
.execute()
} catch (e) {
console.log(e)
} finally {
await db.destroy()
}
process.exit(0)
} catch (e) {
console.log(e)
}
}
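
sync now de-duplicates by collecting every id already stored in nh-galleries and only inserting the local JSON files whose ids are missing. Nothing in this commit creates an index on id, so the following is a hedged aside rather than part of the change: a one-time unique index would keep that lookup fast and make accidental double-inserts fail at the database level, assuming each gallery id is meant to appear only once.

// Optional hardening, not part of this commit: back the id lookup with a unique
// index so repeated sync runs stay fast and duplicate ids are rejected by MongoDB.
import { collections } from '../constants/mongo'

export const ensureGalleryIndex = () =>
  collections.galleries.createIndex({ id: 1 }, { unique: true })
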
20 changes: 20 additions & 0 deletions scraper/src/constants/mongo.ts
@@ -0,0 +1,20 @@
import { MongoClient } from 'mongodb'

import type { Hentai } from "@riffyh/commons";

const globalMongo = global as unknown as {
mongo?: MongoClient
}

export const mongo = globalMongo.mongo || new MongoClient(Bun.env.MONGODB_URL!)

globalMongo.mongo = mongo

// We can call `.db` and `.collection` as much as we like.
// Until we actually make a query, it won’t connect to the database.

const db = mongo.db('riffyh-data')

export const collections = {
galleries: db.collection<Hentai>('nh-galleries'),
}
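
The new module exports one shared MongoClient plus a typed handle for the nh-galleries collection; as the inline comment notes, the driver only connects once a query actually runs. A minimal usage sketch from a command module, assuming MONGODB_URL points at a reachable instance (the gallery id below is invented for illustration):

import { collections, mongo } from '../constants/mongo'

const example = async () => {
  // The first operation is what actually opens the connection.
  const cached = await collections.galleries.findOne(
    { id: 1 },                       // hypothetical id, purely for illustration
    { projection: { _id: 0 } }
  )
  console.log(cached ? 'gallery already cached' : 'gallery not cached yet')

  // Close the shared client once the process is finished with the database.
  await mongo.close()
}

example()
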
11 changes: 0 additions & 11 deletions scraper/src/functions/createDBConnection.ts

This file was deleted.

2 changes: 0 additions & 2 deletions scraper/src/index.ts
@@ -5,7 +5,6 @@ import { hideBin } from 'yargs/helpers'

import { fetch } from './commands/fetch'
import { sync } from './commands/sync'
import { seed } from './commands/seed'

yargs(hideBin(process.argv))
.command<{ file: string, browser: boolean }>(
@@ -21,6 +20,5 @@ yargs(hideBin(process.argv))
argv => fetch(argv.file, argv.browser)
)
.command('sync', 'sync data to database', () => {}, sync)
.command('seed', 'generate cache table on database', () => {}, seed)
.demandCommand(1)
.parse()
