Skip to content

Commit

Permalink
Add scraping and all data
Browse files Browse the repository at this point in the history
  • Loading branch information
Ackuq committed Sep 17, 2023
1 parent 9fa85d7 commit acce828
Show file tree
Hide file tree
Showing 15 changed files with 10,551 additions and 37 deletions.
1,961 changes: 1,961 additions & 0 deletions packages/scrapers/.data/c.json

Large diffs are not rendered by default.

1,796 changes: 1,796 additions & 0 deletions packages/scrapers/.data/kd.json

Large diffs are not rendered by default.

1,574 changes: 1,574 additions & 0 deletions packages/scrapers/.data/l.json

Large diffs are not rendered by default.

1,032 changes: 1,032 additions & 0 deletions packages/scrapers/.data/m.json

Large diffs are not rendered by default.

670 changes: 670 additions & 0 deletions packages/scrapers/.data/mp.json

Large diffs are not rendered by default.

2,103 changes: 2,103 additions & 0 deletions packages/scrapers/.data/s.json

Large diffs are not rendered by default.

580 changes: 580 additions & 0 deletions packages/scrapers/.data/sd.json

Large diffs are not rendered by default.

651 changes: 651 additions & 0 deletions packages/scrapers/.data/v.json

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions packages/scrapers/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
"description": "",
"main": "index.js",
"scripts": {
"run": ""
"scrape": "ts-node src/index.ts",
"scrape:all": "ts-node src/index.ts -p all"
},
"devDependencies": {
"@partiguiden/eslint-config-base": "workspace:*",
"@partiguiden/tsconfig": "workspace:*",
"@types/node": "20.6.0",
"cheerio": "1.0.0-rc.12",
"pdfjs-dist": "3.10.111"
"pdfjs-dist": "3.10.111",
"ts-node": "^10.9.1"
}
}
97 changes: 79 additions & 18 deletions packages/scrapers/src/get-party-data.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,81 @@
import * as fs from "node:fs";
import scrapers from "./scrapers";
import type { ScraperResult } from "./scraper";

/**
 * Shape of a party's JSON data file: scraped results keyed by page URL.
 */
interface StoredData {
[url: string]: ScraperResult & {
// ISO-8601 timestamp (Date#toISOString) set when the entry is added or
// when its opinions/title are found to have changed.
fetchDate: string;
// Optional subject label; writePartyData never assigns a real value to it.
// NOTE(review): presumably filled in by another step — confirm.
subject?: string;
};
}

// Directory that holds one `<abbreviation>.json` file per party. The path is
// relative, so it is resolved against the process's current working directory.
const DATA_DIRECTORY = ".data";

/**
 * Merges freshly scraped pages into the JSON file stored for a party.
 *
 * The file lives at `${DATA_DIRECTORY}/<abbreviation lower-cased>.json` and is
 * keyed by page URL. After the merge:
 *  - URLs that are no longer present in `newData` are removed,
 *  - URLs that are new are added and stamped with the current time,
 *  - URLs whose `opinions` or `title` changed are updated and re-stamped,
 *  - unchanged entries keep their previous `fetchDate` (and any other
 *    stored fields such as `subject`).
 *
 * A missing file is treated as an empty store, so entries written on the very
 * first run also receive a `fetchDate` and match the `StoredData` shape.
 *
 * @param abbreviation Party abbreviation; used (lower-cased) as the file name.
 * @param newData Scraped results keyed by page URL.
 * @throws If the existing file cannot be read or does not contain valid JSON.
 */
function writePartyData(
  abbreviation: string,
  newData: Record<string, ScraperResult>,
): void {
  const fetchDate = new Date().toISOString();
  const fileName = `${DATA_DIRECTORY}/${abbreviation.toLocaleLowerCase()}.json`;

  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(DATA_DIRECTORY, { recursive: true });

  // Start from an empty store on the first run so that every entry goes
  // through the same "added" path below and gets a fetchDate.
  const storedData: StoredData = fs.existsSync(fileName)
    ? (JSON.parse(fs.readFileSync(fileName, "utf8")) as StoredData)
    : {};

  // Delete entries whose URL was not scraped this time.
  for (const link of Object.keys(storedData)) {
    if (!(link in newData)) {
      delete storedData[link];
    }
  }

  for (const [link, incoming] of Object.entries(newData)) {
    const stored = storedData[link];

    // New entry: store it as scraped, stamped with the current time.
    if (stored === undefined) {
      storedData[link] = { ...incoming, fetchDate };
      continue;
    }

    // Existing entry: only touch it (and its fetchDate) when content changed.
    const unchanged =
      stored.title === incoming.title &&
      JSON.stringify(stored.opinions) === JSON.stringify(incoming.opinions);
    if (unchanged) {
      continue;
    }

    storedData[link] = {
      ...stored,
      opinions: incoming.opinions,
      title: incoming.title,
      fetchDate,
    };
  }

  fs.writeFileSync(fileName, JSON.stringify(storedData, null, 2));
}

/**
 * Options accepted by `getPartyData`.
 */
export interface GetPartyData {
// Party abbreviation used to look up the matching scraper (case-insensitive).
abbreviation: string;
// Forwarded as the first argument to the scraper's `getPages`.
start?: number;
// Forwarded as the second argument to the scraper's `getPages`.
limit?: number;
// NOTE(review): this view interleaves removed and added diff lines and the
// callers stop passing `preview` — confirm whether this field still exists.
preview?: boolean;
}

export default async function getPartyData({
abbreviation,
start,
limit,
preview,
}: GetPartyData) {
if (!Object.keys(scrapers).includes(abbreviation.toLowerCase())) {
throw new Error(`No scraper created for party ${abbreviation}`);
Expand All @@ -22,21 +85,19 @@ export default async function getPartyData({
scrapers[abbreviation.toLocaleLowerCase() as keyof typeof scrapers];

const data = await scraper.getPages(start, limit);
const keyedData = data.reduce(
(prev, current) => ({
...prev,
[current.url]: { ...current },
}),
{} as StoredData,
);

if (preview) {
console.log(`Number of entries: ${data.result.length}`);
console.log(
`Number of entries without content: ${
data.result.filter((entry) => entry.opinions.length === 0).length
}`,
);
if (!fs.existsSync(".scraper_out")) {
fs.mkdirSync(".scraper_out");
}
fs.writeFileSync(
`.scraper_out/${abbreviation.toLowerCase()}.json`,
JSON.stringify(data, null, 2),
);
}
return data;
console.log(`Number of entries: ${data.length}`);
console.log(
`Number of entries without content: ${
data.filter((entry) => entry.opinions.length === 0).length
}`,
);
writePartyData(abbreviation, keyedData);
}
2 changes: 0 additions & 2 deletions packages/scrapers/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,11 @@ if (party === "all") {
getPartyData({
abbreviation,
limit: limit ? parseInt(limit) : undefined,
preview: true,
});
});
} else {
getPartyData({
abbreviation: party,
limit: limit ? parseInt(limit) : undefined,
preview: true,
});
}
5 changes: 2 additions & 3 deletions packages/scrapers/src/party/sd-scraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,9 @@ export default class SDScraper extends Scraper {
"https://sd.se/wp-content/uploads/2022/07/sverigedemokraternas-valplattform-2022-april.pdf";
listPath = "";
listSelector = "";
opinionTags = [];
titleTag = "";

async getPages(): Promise<{ result: ScraperResult[]; hasMore: boolean }> {
async getPages(): Promise<ScraperResult[]> {
const pdf = await pdfjs.getDocument({
url: this.baseUrl,
useSystemFonts: true,
Expand Down Expand Up @@ -90,6 +89,6 @@ export default class SDScraper extends Scraper {
},
);
const dataWithStandpoints = await Promise.all(dataWithStandpointsPromises);
return { result: dataWithStandpoints, hasMore: false };
return dataWithStandpoints;
}
}
1 change: 0 additions & 1 deletion packages/scrapers/src/party/v-scraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ export default class VScraper extends Scraper {
baseUrl = "https://www.vansterpartiet.se";
listPath = "/var-politik/politik-a-o/";
listSelector = ".ModuleWrapper-module--component--W7iGr section a";
opinionTags = [];

protected getOpinions($: CheerioAPI): string[] {
const $articleBody = $("div.ArticleBody-module--component--f0xhF");
Expand Down
13 changes: 4 additions & 9 deletions packages/scrapers/src/scraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ interface ScraperArgs {
listSelector: string;
absoluteUrls: boolean;
pathRegex?: RegExp;
opinionTags: string[];
opinionTags?: string[];
}

export default abstract class Scraper implements ScraperArgs {
abstract baseUrl: string;
abstract listPath: string;
abstract listSelector: string;
abstract opinionTags: string[];
opinionTags?: string[];
absoluteUrls: boolean = false;
pathRegex?: RegExp;

Expand Down Expand Up @@ -96,10 +96,7 @@ export default abstract class Scraper implements ScraperArgs {
};
}

async getPages(
start = 0,
limit?: number,
): Promise<{ result: ScraperResult[]; hasMore: boolean }> {
async getPages(start = 0, limit?: number): Promise<ScraperResult[]> {
const response = await fetch(this.baseUrl + this.listPath, {
headers: { "Content-Type": "text/plain; charset=UTF-8" },
});
Expand All @@ -109,9 +106,7 @@ export default abstract class Scraper implements ScraperArgs {

console.info(`Found ${$elements.length} list elements`);

let hasMore = false;
if (limit !== undefined) {
hasMore = start + limit <= $elements.length - 1;
$elements = $elements.slice(start, start + limit);
}

Expand All @@ -133,6 +128,6 @@ export default abstract class Scraper implements ScraperArgs {
},
)
.map((fulfilled) => fulfilled.value);
return { result: resolved, hasMore };
return resolved;
}
}
Loading

0 comments on commit acce828

Please sign in to comment.