Merge pull request #102 from the-markup/output-expansion
Output args, refactoring
BatMiles committed Sep 12, 2024
2 parents 0e80212 + d4775a5 commit 4d30631
Showing 4 changed files with 253 additions and 36 deletions.
202 changes: 202 additions & 0 deletions __tests__/pptr-utils.ts
@@ -0,0 +1,202 @@
import { dedupLinks, getSocialLinks } from '../src/pptr-utils/get-links';
import { SOCIAL_URLS } from '../src/pptr-utils/default';
import { LinkObject } from '../src/types';

describe('get-links', () => {
  describe('dedupLinks', () => {
    it('removes duplicates from an array of links', () => {
      const links: LinkObject[] = [
        {
          href: "www.url1.com",
          innerHtml: "html",
          innerText: "Hello, world"
        },
        {
          href: "www.url1.com",
          innerHtml: "html",
          innerText: "Hello, world"
        },
        {
          href: "www.url2.com",
          innerHtml: "html2",
          innerText: "bah bah black sheep"
        }
      ];
      const result = dedupLinks(links);
      expect(result.length).toBe(2);
      expect(result).toStrictEqual([
        {
          href: "www.url1.com",
          innerHtml: "html",
          innerText: "Hello, world"
        },
        {
          href: "www.url2.com",
          innerHtml: "html2",
          innerText: "bah bah black sheep"
        }
      ]);
    });
    it('returns original array of links if no duplicates', () => {
      const links: LinkObject[] = [
        {
          href: "www.url1.com",
          innerHtml: "html",
          innerText: "Hello, world"
        },
        {
          href: "www.url2.com",
          innerHtml: "html2",
          innerText: "bah bah black sheep"
        }
      ];
      const result = dedupLinks(links);
      expect(result).toHaveLength(2);
      expect(result).toStrictEqual(links);
    });
  });

  describe('getSocialLinks', () => {
    it('filters out only social links from a list of LinkObjects', () => {
      const links: LinkObject[] = [
        {
          href: 'url1.com',
          innerHtml: 'html',
          innerText: 'hello world!'
        },
        {
          href: 'www.facebook.com',
          innerHtml: 'facebook',
          innerText: 'fb'
        },
        {
          href: 'www.x.com',
          innerHtml: 'x',
          innerText: 'x'
        }
      ];
      const results = getSocialLinks(links);
      expect(results).toHaveLength(2);
      expect(results).toStrictEqual([
        {
          href: 'www.facebook.com',
          innerHtml: 'facebook',
          innerText: 'fb'
        },
        {
          href: 'www.x.com',
          innerHtml: 'x',
          innerText: 'x'
        }
      ]);
    });
    it('doesn\'t recognize urls ending in a social url', () => {
      const links = [
        {
          href: 'www.x.com',
          innerHtml: 'x',
          innerText: 'x'
        },
        {
          href: 'x.com',
          innerHtml: 'x',
          innerText: 'x'
        },
        {
          href: 'www.fix.com',
          innerHtml: 'fix',
          innerText: 'fix'
        },
        {
          href: 'fix.com',
          innerHtml: 'fix',
          innerText: 'fix'
        }
      ];
      const results = getSocialLinks(links);
      expect(results).toHaveLength(2);
      expect(results).toStrictEqual([
        {
          href: 'www.x.com',
          innerHtml: 'x',
          innerText: 'x'
        },
        {
          href: 'x.com',
          innerHtml: 'x',
          innerText: 'x'
        },
      ]);
    });
    it('returns original array if it contains only social links', () => {
      const links = [
        {
          href: 'www.evernote.com',
          innerHtml: 'evernote',
          innerText: 'evernote'
        },
        {
          href: 'www.tiktok.com',
          innerHtml: 'tiktok',
          innerText: 'tiktok'
        }
      ];
      const result = getSocialLinks(links);
      expect(result).toHaveLength(links.length);
      expect(result).toStrictEqual(links);
    });
    it('recognizes every social link', () => {
      const links: LinkObject[] = SOCIAL_URLS.map(url => ({ href: `www.${url}`, innerText: 'text', innerHtml: 'html' }));
      const result = getSocialLinks(links);
      expect(result).toHaveLength(links.length);
      expect(result).toStrictEqual(links);
    });
    it('recognizes different versions of a social link', () => {
      const links = [
        {
          href: 'snapchat.com',
          innerHtml: 'snapchat',
          innerText: 'snapchat'
        },
        {
          href: 'www.snapchat.com',
          innerHtml: 'snapchat',
          innerText: 'snapchat'
        },
        {
          href: 'http://snapchat.com',
          innerHtml: 'snapchat',
          innerText: 'snapchat'
        },
        {
          href: 'https://snapchat.com',
          innerHtml: 'snapchat',
          innerText: 'snapchat'
        },
        {
          href: 'http://www.snapchat.com',
          innerHtml: 'snapchat',
          innerText: 'snapchat'
        },
        {
          href: 'snapchat.com/page',
          innerHtml: 'snapchat',
          innerText: 'snapchat'
        },
        {
          href: 'www.snapchat.com/page',
          innerHtml: 'snapchat',
          innerText: 'snapchat'
        },
        {
          href: 'subdomain.snapchat.com/page',
          innerHtml: 'snapchat',
          innerText: 'snapchat'
        }
      ];
      const results = getSocialLinks(links);
      expect(results).toHaveLength(links.length);
      expect(results).toStrictEqual(links);
    });
  });
});
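For orientation, these tests exercise helpers that are normally called during a live crawl. Below is a minimal sketch of how they fit together, assuming an already-installed Puppeteer and using the same relative import as the test file; the target URL and launch options are illustrative, not taken from this commit.

import puppeteer from 'puppeteer';
import { getLinks, dedupLinks, getSocialLinks } from '../src/pptr-utils/get-links';

// Sketch only: open a page, gather its anchors, drop duplicate hrefs, keep social links.
const sketch = async () => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto('https://example.com'); // illustrative target

  const links = dedupLinks(await getLinks(page)); // getLinks evaluates in the page and returns LinkObject[]
  const social = getSocialLinks(links);           // keeps only hrefs matching SOCIAL_URLS

  console.log(social.map(link => link.href));
  await browser.close();
};

sketch();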
12 changes: 10 additions & 2 deletions src/collector.ts
@@ -59,18 +59,26 @@ export const collect = async (inUrl: string, args: CollectorOptions) => {
   const logger = getLogger({ outDir: args.outDir, quiet: args.quiet });
 
   const output: any = {
-    args: args.title,
+    title: args.title,
     uri_ins: inUrl,
     uri_dest: null,
     uri_redirects: null,
     secure_connection: {},
     host: new URL(inUrl).hostname,
     config: {
       emulateDevice: args.emulateDevice,
       cleareCache: args.clearCache,
       captureHar: args.captureHar,
       captureLinks: args.captureLinks,
       enableAdBlock: args.enableAdBlock,
-      numPages: args.numPages
+      saveBrowserProfile: args.saveBrowserProfile,
+      numPages: args.numPages,
+      defaultTimeout: args.defaultTimeout,
+      defaultWaitUntil: args.defaultWaitUntil,
+      headless: args.headless,
+      headers: args.headers,
+      extraChromiumArgs: args.extraChromiumArgs,
+      extraPuppeteerOptions: args.extraPuppeteerOptions,
     },
     browser: null,
     script: {
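The expanded output.config block simply echoes the options passed to collect, so a scan's output now records how it was run. A hedged sketch of a call exercising those options follows; the values are illustrative, the option names come from this diff and the getLogger call above, and the full shape of CollectorOptions (and of collect's return value) is not shown in this commit, hence the cast.

import { collect } from './src/collector'; // import path assumed, relative to the project root

// Illustrative values only; option names are taken from the diff above.
const runScan = async () => {
  const result = await collect('https://example.com', {
    title: 'Example scan',
    outDir: './outputs/example',
    quiet: false,
    emulateDevice: 'iPhone X',       // the expected shape of this option is assumed here
    clearCache: true,
    captureHar: true,
    captureLinks: true,
    enableAdBlock: false,
    saveBrowserProfile: false,
    numPages: 3,
    defaultTimeout: 30000,
    defaultWaitUntil: 'networkidle2',
    headless: true,
    headers: {},
    extraChromiumArgs: ['--disable-gpu'],
    extraPuppeteerOptions: {}
  } as any); // cast because CollectorOptions' full definition is outside this diff

  console.log(result); // expected to include the expanded config block shown above
};

runScan();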
33 changes: 33 additions & 0 deletions src/pptr-utils/default.ts
@@ -27,3 +27,36 @@ export const defaultPuppeteerBrowserOptions = {
   defaultViewport: null,
   headless: true
 };
+
+export const SOCIAL_URLS = [
+  'facebook.com',
+  'linkedin.com',
+  'twitter.com',
+  'youtube.com',
+  'instagram.com',
+  'flickr.com',
+  'tumblr.com',
+  'snapchat.com',
+  'whatsapp.com',
+  'docs.google.com',
+  'goo.gl',
+  'pinterest.com',
+  'bit.ly',
+  'evernote.com',
+  'eventbrite.com',
+  'dropbox.com',
+  'slideshare.net',
+  'vimeo.com',
+  'x.com',
+  'bsky.app',
+  'tiktok.com',
+  'mastodon.social',
+  'threads.net',
+  'wechat.com',
+  'messenger.com',
+  'telegram.org',
+  'douyin.com',
+  'kuaishou.com',
+  'weibo.com',
+  'im.qq.com',
+];
42 changes: 8 additions & 34 deletions src/pptr-utils/get-links.ts
@@ -1,5 +1,6 @@
 import { LinkObject } from '../types';
 import { hasOwnProperty } from '../utils';
+import { SOCIAL_URLS } from './default';
 
 export const getLinks = async (page): Promise<LinkObject[]> => {
   return page.evaluate(() => {
@@ -25,41 +26,14 @@ export const getLinks = async (page): Promise<LinkObject[]> => {
   });
 };
 
-// https://dev.to/vuevixens/removing-duplicates-in-an-array-of-objects-in-js-with-sets-3fep
-export const dedupLinks = (links_with_duplicates: LinkObject[]) => {
-  const sanitizedLinks = links_with_duplicates.filter(f => f && hasOwnProperty(f, 'href'));
-  const dedupedLinkArray = Array.from(new Set(sanitizedLinks));
-  // I don't think the bellow modification actually does anything,
-  // but I'm gonna write tests for this function before pulling the plug
-  const links = dedupedLinkArray
-    .map((link:LinkObject) => link.href)
-    .map(href => {
-      return links_with_duplicates.find(link => link.href === href);
-    });
-  return links;
-};
+// Uses Set to remove duplicates by reducing LinkObjects to their href property, deduping via Set,
+// then reconstituting an array of full LinkObjects
+export const dedupLinks = (links_with_duplicates: LinkObject[]):LinkObject[] => {
+  const sanitized_links = links_with_duplicates.filter(f => f && hasOwnProperty(f, 'href')).map(link => link.href);
+  const deduped_href_array = Array.from(new Set(sanitized_links));
 
-const SOCIAL_URLS = [
-  'facebook.com',
-  'linkedin.com',
-  'twitter.com',
-  'youtube.com',
-  'instagram.com',
-  'flickr.com',
-  'tumblr.com',
-  'snapchat.com',
-  'whatsapp.com',
-  'docs.google.com',
-  'goo.gl',
-  'pinterest.com',
-  'bit.ly',
-  'plus.google.com',
-  'evernote.com',
-  'eventbrite.com',
-  'dropbox.com',
-  'slideshare.net',
-  'vimeo.com'
-];
+  return deduped_href_array.map(href => links_with_duplicates.find(link => link.href === href));
+};
 
 export const getSocialLinks = (links: LinkObject[]): LinkObject[] => {
   const spRegex = new RegExp(`\\b(${SOCIAL_URLS.join('|')})\\b`, 'i');
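Two notes on the rewritten helpers. dedupLinks now dedupes on href strings explicitly; the old version's Set held object references, so value-duplicates survived, which is what the deleted comment was questioning and what the new tests pin down. getSocialLinks matches against a single case-insensitive, word-bounded alternation built from SOCIAL_URLS; the \b boundary is what keeps look-alike domains out, since there is no word boundary between "fi" and "x.com". A standalone sketch of that matching rule, using an abbreviated list rather than the repo's export:

// Standalone illustration of the rule used by getSocialLinks (list abbreviated for the demo).
const DEMO_SOCIAL_URLS = ['facebook.com', 'x.com', 'snapchat.com'];
const demoRegex = new RegExp(`\\b(${DEMO_SOCIAL_URLS.join('|')})\\b`, 'i');

console.log(demoRegex.test('www.x.com'));                  // true: "." before "x" is a word boundary
console.log(demoRegex.test('x.com'));                      // true: start of string counts as a boundary
console.log(demoRegex.test('www.fix.com'));                // false: no boundary between "fi" and "x"
console.log(demoRegex.test('https://sub.snapchat.com/p')); // true: subdomains and paths still match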
