Skip to content

Commit

Permalink
Optimize scraping (a lot)
Browse files Browse the repository at this point in the history
  • Loading branch information
frixaco committed Nov 5, 2024
1 parent a6752fc commit 8abb9d5
Show file tree
Hide file tree
Showing 19 changed files with 2,193 additions and 5,681 deletions.
37 changes: 0 additions & 37 deletions .github/workflows/preview.yml

This file was deleted.

37 changes: 0 additions & 37 deletions .github/workflows/production.yml

This file was deleted.

Empty file removed .npmrc
Empty file.
5 changes: 0 additions & 5 deletions apps/api-go/payload.json

This file was deleted.

2 changes: 2 additions & 0 deletions apps/api/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# deps
node_modules/
11 changes: 11 additions & 0 deletions apps/api/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
To install dependencies:
```sh
bun install
```

To run:
```sh
bun run dev
```

open http://localhost:3000
Binary file added apps/api/bun.lockb
Binary file not shown.
1,977 changes: 1,977 additions & 0 deletions apps/api/details.html

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions apps/api/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"name": "api",
"private": true,
"scripts": {
"dev": "bun run --hot src/index.ts"
},
"dependencies": {
"drizzle-orm": "^0.36.0",
"hono": "^4.6.8",
"jsdom": "^25.0.1",
"pg": "^8.13.1",
"zod": "^3.23.8"
},
"devDependencies": {
"@types/bun": "latest",
"@types/jsdom": "^21.1.7",
"@types/pg": "^8.11.10",
"drizzle-kit": "^0.27.1"
}
}
5 changes: 5 additions & 0 deletions apps/api/payload.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"userId": "user_2fJE3fogkq7ScFnUojOlAlSyzeZ",
"hianimeUrl": "https://hianime.to/watch/blue-lock-season-2-19318?ep=128447",
"nyaasiUrl": "https://nyaa.si/?f=0&c=0_0&q=blue+lock+season+2"
}
165 changes: 165 additions & 0 deletions apps/api/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import { Context, Hono } from "hono";
import { cors } from "hono/cors";
import { logger } from "hono/logger";

import { drizzle } from "drizzle-orm/node-postgres";
import { z } from "zod";

import { JSDOM } from "jsdom";

// Drizzle client for node-postgres, configured from the DATABASE_URL
// environment variable.
// NOTE(review): `db` is not referenced anywhere else in this file, and
// DATABASE_URL may be undefined at runtime — confirm whether that is intended.
const db = drizzle(process.env.DATABASE_URL);

// The Hono application; routes are registered on it below and it is the
// module's default export.
const app = new Hono();

// Middleware: CORS (with its default options) on every path, then the
// request logger.
app.use("*", cors());
app.use(logger());

/**
 * Browser-like request headers attached to the outgoing `fetch` calls made
 * while scraping.
 * NOTE(review): presumably a precaution against the target site rejecting
 * non-browser clients — the name suggests it is not known to be required.
 */
const justInCaseBrowserHeaders = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
  "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.5",
  "Connection": "keep-alive",
};

/**
 * Validation schema for the JSON body accepted by `POST /scrape`.
 *
 * - `userId`: opaque user identifier (not used by the scraping code in this
 *   file).
 * - `hianimeUrl`: absolute URL of the hianime title/watch page. It is passed
 *   to `fetch` as-is, so it must parse as a URL; rejecting it here surfaces a
 *   validation error instead of a late fetch failure.
 * - `nyaasiUrl`: nyaa.si search URL.
 *   NOTE(review): nothing in this file reads it yet, so it is left as a plain
 *   string; tighten to `.url()` once it is actually fetched.
 */
const schema = z.object({
  userId: z.string(),
  hianimeUrl: z.string().url(),
  nyaasiUrl: z.string(),
});

/** One episode entry parsed from the hianime episode-list markup. */
type HianimeEpisode = {
  url: string;
  title: string;
  episodeNumber: string;
  episodeId: string;
};

/**
 * Result envelope of `processHianimeUrl`: `data` is set and `error` is null on
 * success; `data` is null and `error` holds a message on failure.
 */
type HianimeScrapeResult = {
  data: {
    episodes: HianimeEpisode[];
    title: string;
    titleId: string;
    thumbnailUrl: string | null;
  } | null;
  error: string | null;
};

/**
 * Extracts the trailing numeric title ID from a hianime URL.
 *
 * Supported hianime URL formats:
 * - https://hianime.to/watch/blue-lock-season-2-19318?ep=128447
 * - https://hianime.to/watch/blue-lock-season-2-19318?ep=128447&ep=128447
 * - https://hianime.to/blue-lock-season-2-19318
 *
 * Returns null when the path does not end in `-<digits>`. Requiring digits
 * keeps arbitrary text (e.g. a whole URL without any dash) from being
 * interpolated into the episode-list endpoint.
 * NOTE(review): assumes title IDs are always numeric, as in every documented
 * format above — confirm.
 */
const extractHianimeTitleId = (url: string): string | null => {
  // Ignore the query string / fragment; tolerate trailing slashes.
  const pathPart = url.split(/[?#]/)[0] ?? "";
  const match = /-(\d+)\/*$/.exec(pathPart);
  return match?.[1] ?? null;
};

/**
 * Pulls the first `url(...)` target out of an inline `style` attribute value,
 * with or without surrounding quotes. Returns null when there is none.
 */
const extractCoverUrl = (style: string | null | undefined): string | null => {
  if (!style) {
    return null;
  }
  const match = /url\(\s*(['"]?)(.*?)\1\s*\)/.exec(style);
  const target = match?.[2];
  return target ? target : null;
};

/**
 * Scrapes a hianime title: its episode list (from the site's ajax endpoint)
 * plus the title text and cover thumbnail (from the page at `hianimeUrl`).
 *
 * @param hianimeUrl Absolute hianime URL in one of the formats documented on
 *   `extractHianimeTitleId`.
 * @returns `{ data, error: null }` on success, `{ data: null, error }` on any
 *   failure. Network and parsing exceptions are logged and reported through
 *   `error` rather than thrown.
 *
 * NOTE(review): `hianimeUrl` comes from the request body and is fetched
 * as-is; consider restricting its host to avoid server-side request forgery.
 */
const processHianimeUrl = async (
  hianimeUrl: string
): Promise<HianimeScrapeResult> => {
  const fail = (error: string): HianimeScrapeResult => ({ data: null, error });

  const titleId = extractHianimeTitleId(hianimeUrl);
  if (!titleId) {
    return fail("No title ID found");
  }

  try {
    // The two requests are independent, so issue them concurrently.
    const [listRes, detailsRes] = await Promise.all([
      fetch(`https://hianime.to/ajax/v2/episode/list/${titleId}`, {
        headers: justInCaseBrowserHeaders,
      }),
      fetch(hianimeUrl, {
        headers: justInCaseBrowserHeaders,
      }),
    ]);
    if (!listRes.ok) {
      return fail("Failed to fetch episodes");
    }
    if (!detailsRes.ok) {
      return fail("Failed to fetch title details");
    }

    // The endpoint is expected to answer `{ html: string; status: boolean }`;
    // verify the one field we depend on instead of trusting the shape.
    const payload = (await listRes.json()) as { html?: unknown } | null;
    const listHtml = payload?.html;
    if (typeof listHtml !== "string") {
      return fail("Failed to fetch episodes");
    }

    const listDoc = new JSDOM(listHtml).window.document;
    const episodes: HianimeEpisode[] = [];
    // A plain loop (not forEach) so that `return` actually aborts the scrape.
    for (const element of Array.from(
      listDoc.querySelectorAll(".ssl-item.ep-item")
    )) {
      const href = element.getAttribute("href");
      const title = element.getAttribute("title");
      const episodeNumber = element.getAttribute("data-number");
      const episodeId = element.getAttribute("data-id");

      if (!href || !title || !episodeNumber || !episodeId) {
        console.error(
          "Missing href, title, data-number or data-id for an episode: ",
          element.outerHTML
        );
        return fail("Missing data for an episode");
      }

      episodes.push({
        url: `https://hianime.to${href}`,
        title,
        episodeNumber,
        episodeId,
      });
    }

    const detailsDoc = new JSDOM(await detailsRes.text()).window.document;

    // The cover image is only available as a CSS `url(...)` in the inline style.
    const thumbnailUrl = extractCoverUrl(
      detailsDoc.querySelector(".anis-cover")?.getAttribute("style")
    );

    const titleElement = detailsDoc.querySelector(".os-item.active");
    if (!titleElement) {
      return fail("No title element found");
    }
    const title = titleElement.getAttribute("title");
    if (!title) {
      return fail("No title found");
    }

    return {
      data: { episodes, title, titleId, thumbnailUrl },
      error: null,
    };
  } catch (err: unknown) {
    // Network failures, invalid JSON, etc. — keep the result-envelope contract.
    console.error("Failed to scrape hianime: ", err);
    return fail("Failed to scrape hianime");
  }
};

/**
 * POST /scrape
 *
 * Expects a JSON body matching `schema` and responds with the scraped
 * `{ episodes, title, titleId, thumbnailUrl }` for the given hianime URL.
 *
 * Responses:
 * - 400 when the body is not valid JSON or fails schema validation (client
 *   error, previously surfaced as an unhandled exception / 500).
 * - 500 when scraping fails; the body is `{ error }`.
 */
app.post("/scrape", async (c) => {
  let body: unknown;
  try {
    body = await c.req.json();
  } catch {
    return c.json({ error: "Request body must be valid JSON" }, 400);
  }

  const parsed = schema.safeParse(body);
  if (!parsed.success) {
    return c.json(
      { error: "Invalid request body", issues: parsed.error.issues },
      400
    );
  }
  // `userId` and `nyaasiUrl` are validated but not used by this handler yet.
  const { hianimeUrl } = parsed.data;

  const { data, error } = await processHianimeUrl(hianimeUrl);
  if (error || !data) {
    return c.json({ error: error ?? "Unknown scraping error" }, 500);
  }
  const { episodes, title, titleId, thumbnailUrl } = data;

  return c.json({ episodes, title, titleId, thumbnailUrl });
});

export default app;
7 changes: 7 additions & 0 deletions apps/api/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"compilerOptions": {
"strict": true,
"jsx": "react-jsx",
"jsxImportSource": "hono/jsx"
}
}
Binary file added bun.lockb
Binary file not shown.
24 changes: 6 additions & 18 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,23 +1,11 @@
{
"name": "anitrack",
"private": true,
"scripts": {
"build": "turbo build",
"dev": "turbo watch dev",
"start": "turbo start",
"web:dev": "turbo watch dev --filter=web",
"api:dev": "turbo watch dev --filter=api-go",
"lint": "turbo lint --filter=web",
"lint-fix": "turbo lint-fix --filter=web",
"format-check": "turbo format-check --filter=web",
"format-fix": "turbo format-fix --filter=web"
},
"scripts": {},
"workspaces": [
"apps/*"
],
"devDependencies": {
"prettier": "^3",
"turbo": "^2.0.11"
},
"engines": {
"node": ">=20"
},
"packageManager": "[email protected]"
"prettier": "^3"
}
}
1 change: 0 additions & 1 deletion packages/README.md

This file was deleted.

Loading

0 comments on commit 8abb9d5

Please sign in to comment.