feat: browser-mode scraping

rayriffy committed Feb 14, 2025
1 parent e158abb commit 28a8594
Showing 8 changed files with 160 additions and 31 deletions.
Binary file removed bun.lockb
3 changes: 3 additions & 0 deletions scraper/package.json
@@ -20,6 +20,9 @@
     "kysely": "0.27.4",
     "p-queue": "8.0.1",
     "pg": "8.13.1",
+    "puppeteer": "24.1.1",
+    "puppeteer-extra": "3.3.6",
+    "puppeteer-extra-plugin-stealth": "2.11.2",
     "yargs": "17.7.2"
   },
   "devDependencies": {
4 changes: 4 additions & 0 deletions scraper/src/@types/FetchResult.ts
@@ -0,0 +1,4 @@
+export interface FetchResult {
+  success: number
+  failure: number
+}
49 changes: 19 additions & 30 deletions scraper/src/commands/fetch.ts
@@ -1,22 +1,16 @@
 import fs from 'fs'
 import path from 'path'

-import PQueue from 'p-queue'
 import { DatabaseCode, Hentai, itemsPerPage } from '@riffyh/commons'

 import { chunk } from '../constants/chunk'
 import { cacheDirectory } from '../constants/cacheDirectory'
-import { Kysely } from 'kysely'
-import { SQLDatabase } from '../@types/SQLDatabase'
-import { createDBConnection } from '../functions/createDBConnection'
-import { getRemoteHentai } from '../functions/getRemoteHentai'
-import {parseUrl} from "../functions/parseUrl";
+import { hentaiDirectory } from '../constants/hentaiDirectory'
+import { parseUrl } from "../functions/parseUrl";
+import {getGalleriesViaBrowser} from "../functions/getGalleriesViaBrowser";
+import {getGalleriesViaFetch} from "../functions/getGalleriesViaFetch";

-const fetchQueue = new PQueue({
-  concurrency: 8,
-})
-
-export const fetch = async (entryPoint: string) => {
+export const fetch = async (entryPoint: string, browserMode: boolean) => {
   const { default: codes } = (await import(
     path.join(process.cwd(), entryPoint)
   )) as {
@@ -48,34 +42,27 @@ export const fetch = async (entryPoint: string) => {
     })
   )

+  console.log(`${chunks.length} chunks generated`)
+
   /**
    * Step 2: Fetch all items
   */
-  console.log('fetching all galleries')
-  const hentaiDirectory = path.join(cacheDirectory, 'hentai')
-
+  console.log('fetching galleries...')
   if (!fs.existsSync(hentaiDirectory))
     await fs.promises.mkdir(hentaiDirectory, { recursive: true })

-  let hasError
-  let db: Kysely<SQLDatabase> | undefined
-
-  if (process.env.DATABASE_URL !== undefined) db = createDBConnection()
+  // let hasError
+  const idsNeedsToBeFetched = codes
+    .map(code => typeof code === 'object' ? code.code : code)
+    .filter(code => {
+      return !fs.existsSync(path.join(hentaiDirectory, `${code}.json`))
+    })

-  await Promise.all(
-    codes.map(code =>
-      fetchQueue.add(() =>
-        getRemoteHentai(code, db).catch(() => {
-          hasError = true
-        })
-      )
-    )
-  )
+  console.log(`${idsNeedsToBeFetched.length} galleries needs to be fetched`)

-  await fetchQueue.onIdle()
-  await db?.destroy()
+  const fetchResult = await (browserMode ? getGalleriesViaBrowser : getGalleriesViaFetch)(idsNeedsToBeFetched)

-  if (hasError) {
+  if (fetchResult.failure > 0) {
     console.error("there's some error during fetching! crashing...")
     process.exit(1)
   } else {
@@ -85,6 +72,7 @@
   /**
    * Create search keys for searching
   */
+  console.log('generating search keys...')
   const orderedHentai = codes
     .map(code => {
       try {
@@ -114,4 +102,5 @@ export const fetch = async (entryPoint: string) => {
     path.join(cacheDirectory, 'searchKey.json'),
     JSON.stringify(orderedHentai)
   )
+  console.log('completed!')
 }
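
Both fetch strategies resolve to the same call shape, which is what lets the ternary `(browserMode ? getGalleriesViaBrowser : getGalleriesViaFetch)` pick one at runtime. A minimal sketch of that shared contract (the GalleryFetcher alias is illustrative, not part of the commit):

    import type { FetchResult } from '../@types/FetchResult'

    // Hypothetical alias for illustration: both new modules satisfy this shape.
    type GalleryFetcher = (codes: (string | number)[]) => Promise<FetchResult>

    declare const getGalleriesViaBrowser: GalleryFetcher
    declare const getGalleriesViaFetch: GalleryFetcher

    // The dispatch in fetch.ts reduces to choosing one implementation:
    const pickFetcher = (browserMode: boolean): GalleryFetcher =>
      browserMode ? getGalleriesViaBrowser : getGalleriesViaFetch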
74 changes: 74 additions & 0 deletions scraper/src/functions/getGalleriesViaBrowser.ts
@@ -0,0 +1,74 @@
+import puppeteer from 'puppeteer-extra'
+import StealthPlugin from 'puppeteer-extra-plugin-stealth'
+import PQueue from "p-queue";
+import type { DatabaseCode } from "@riffyh/commons/dist";
+import type { Browser } from "puppeteer";
+import { writeItem } from "./writeItem";
+import type { FetchResult } from "../@types/FetchResult";
+
+const fetchQueue = new PQueue({
+  concurrency: 8,
+})
+
+export const getGalleriesViaBrowser = async (codes: (string | number)[]): Promise<FetchResult> => {
+  const [firstGallery, ...galleries] = codes
+
+  let success = 0
+  let failure = 0
+
+  if (codes.length === 0)
+    return {
+      success,
+      failure
+    }
+
+  puppeteer.use(StealthPlugin())
+  const browser = await puppeteer
+    .use(StealthPlugin())
+    .launch({
+      headless: false,
+      defaultViewport: {
+        width: 1190,
+        height: 700
+      },
+      targetFilter: target => target.type() !== 'other'
+    })
+
+  // fetch the first gallery on its own, so any Cloudflare challenge can be cleared once before the parallel fetches start
+  await getItem(firstGallery, browser)
+    .then(() => success++)
+    .catch(() => failure++)
+  await Promise.all(
+    galleries.map(gallery =>
+      fetchQueue.add(() =>
+        getItem(gallery, browser)
+          .then(() => success++)
+          .catch(() => failure++)
+      )
+    )
+  )
+
+  await fetchQueue.onIdle()
+  await browser.close()
+
+  return {
+    success,
+    failure
+  }
+}
+
+const getItem = async (code: string | number, browser: Browser) => {
+  const page = await browser.newPage()
+  await page.goto(`https://nhentai.net/api/gallery/${code}`, {
+    waitUntil: 'networkidle0',
+  })
+
+  // if we landed on the Cloudflare challenge page ("Just a moment..."), wait for the network to go idle again so the challenge can resolve
+  if ((await page.title()).toLowerCase().includes('just a moment'))
+    await page.waitForNetworkIdle()
+
+  const content = await page.$eval('pre', el => el.textContent) ?? ''
+  await page.close()
+
+  await writeItem(code, content)
+}
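
For reference, a usage sketch of the new module in isolation (the gallery codes are placeholders; note that headless is disabled in the launch options above, so a visible browser window will open):

    import { getGalleriesViaBrowser } from './getGalleriesViaBrowser'

    // Fetch two galleries through the stealth browser and report the tally.
    const result = await getGalleriesViaBrowser([177013, '177014'])
    console.log(`fetched: ${result.success}, failed: ${result.failure}`)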
36 changes: 36 additions & 0 deletions scraper/src/functions/getGalleriesViaFetch.ts
@@ -0,0 +1,36 @@
+import PQueue from "p-queue";
+import type { DatabaseCode } from "@riffyh/commons";
+import { fetchHentai } from "./fetchHentai";
+import type { FetchResult } from "../@types/FetchResult";
+
+const fetchQueue = new PQueue({
+  concurrency: 8,
+})
+
+export const getGalleriesViaFetch = async (codes: (string | number)[]): Promise<FetchResult> => {
+  let success = 0
+  let failure = 0
+
+  if (codes.length === 0)
+    return {
+      success,
+      failure
+    }
+
+  await Promise.all(
+    codes.map(code =>
+      fetchQueue.add(() =>
+        fetchHentai(code)
+          .then(() => success++)
+          .catch(() => failure++)
+      )
+    )
+  )
+
+  await fetchQueue.onIdle()
+
+  return {
+    success,
+    failure
+  }
+}
12 changes: 12 additions & 0 deletions scraper/src/functions/writeItem.ts
@@ -0,0 +1,12 @@
+import fs from "fs";
+import path from "path";
+import type { Hentai } from "@riffyh/commons";
+import { hentaiDirectory } from "../constants/hentaiDirectory";
+
+export const writeItem = (code: string | number, item: Hentai | string) => {
+  const file = path.join(hentaiDirectory, `${code}.json`)
+  return fs.promises.writeFile(
+    file,
+    typeof item === 'string' ? item : JSON.stringify(item)
+  )
+}
13 changes: 12 additions & 1 deletion scraper/src/index.ts
@@ -8,7 +8,18 @@ import { sync } from './commands/sync'
 import { seed } from './commands/seed'

 yargs(hideBin(process.argv))
-  .command<{ file: string }>('fetch <file>', 'fetch file from raw source', () => {}, argv => fetch(argv.file))
+  .command<{ file: string, browser: boolean }>(
+    'fetch <file>', 'fetch file from raw source',
+    yargs => {
+      yargs.option('browser', {
+        type: 'boolean',
+        describe:
+          'fetch using full-browser mode in case of cloudflare protection turned on',
+        default: false,
+      });
+    },
+    argv => fetch(argv.file, argv.browser)
+  )
   .command('sync', 'sync data to database', () => {}, sync)
   .command('seed', 'generate cache table on database', () => {}, seed)
   .demandCommand(1)
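
With the option registered, the fetch command accepts the flag like so (an invocation sketch; the runner and the codes file name are assumptions, not part of the commit):

    # plain HTTP fetching (default)
    scraper fetch ./codes.json

    # full-browser mode, for when Cloudflare protection is turned on
    scraper fetch ./codes.json --browser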