From 100c3e9b4a824d2f7933760988587564a6dfea5f Mon Sep 17 00:00:00 2001 From: Adam Schwartz Date: Thu, 10 Oct 2019 19:06:13 -0400 Subject: [PATCH] initial commit --- .gitignore | 2 + README.md | 124 +++++++++++++++ html.js | 432 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 107 +++++++++++++ package.json | 9 ++ wrangler.toml | 4 + 6 files changed, 678 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 html.js create mode 100644 index.js create mode 100644 package.json create mode 100644 wrangler.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9367e4f --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +worker/ +dist/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..19faa5f --- /dev/null +++ b/README.md @@ -0,0 +1,124 @@ +# Web Scraper + +Web Scraper makes it effortless to scrape websites. You provide a URL and a CSS selector, and it returns JSON containing the text contents of the matching elements. 
+ +- [Website](http://web.scraper.workers.dev) + +## Examples + +### Heading from example.com + +[`https://web.scraper.workers.dev/?url=example.com&selector=h1`](https://web.scraper.workers.dev/?url=example.com&selector=h1) + +```JSON +{"result":["Example Domain"]} +``` + +### Profile details from github.com profile page + +[`https://web.scraper.workers.dev/?url=https://github.com/adamschwartz&selector=.vcard-fullname,.d-md-block+[itemprop=worksFor],.d-md-block+[itemprop=homeLocation]&pretty=true`](https://web.scraper.workers.dev/?url=https://github.com/adamschwartz&selector=.vcard-fullname,.d-md-block+[itemprop=worksFor],.d-md-block+[itemprop=homeLocation]&pretty=true) + +```JSON +{ + "result": { + ".vcard-fullname": [ + "Adam Schwartz" + ], + ".d-md-block [itemprop=worksFor]": [ + "@cloudflare" + ], + ".d-md-block [itemprop=homeLocation]": [ + "Boston, MA" + ] + } +} +``` + +### Random quote/author from quotes.net + +[`https://web.scraper.workers.dev/?url=https://www.quotes.net/random.php&selector=%23disp-quote-body,.author&pretty=true`](https://web.scraper.workers.dev/?url=https://www.quotes.net/random.php&selector=%23disp-quote-body,.author&pretty=true) + +``` +{ + "result": { + "#disp-quote-body": [ + "We are advertis'd by our loving friends." + ], + ".author": [ + "William Shakespeare" + ] + } +} +``` + +## API + + - Requests are made as `GET` against `http://web.scraper.workers.dev`. + - There are two required query params, `url` and `selector`. + - There are two optional query params, `pretty` and `spaced`. + +
http://web.scraper.workers.dev
+  ?url=https://example.com
+  &selector=p
+  &pretty=true
+  &spaced=true
+ +### Query params + +#### `url` (required) + + - Supports `https://` and `http://` protocols. + - If a protocol isn’t found, `http://` is prepended. + - e.g. `https://web.scraper.workers.dev/?url=example.com&selector=p` + +#### `selector` (required) + + - Supports the same set of CSS selectors as Cloudflare Workers' [`HTMLRewriter` class](https://developers.cloudflare.com/workers/reference/apis/html-rewriter/#selectors) + - As of Oct 10, 2019, this includes: + - `*` – any element + - `E` – any element of type E + - `E:not(s)` – an E element that does not match the compound selector s + - `E.warning` – an E element belonging to the class warning + - `E#myid` – an E element with ID equal to myid. + - `E[foo]` – an E element with a foo attribute + - `E[foo="bar"]` – an E element whose foo attribute value is exactly equal to bar + - `E[foo="bar" i]` – an E element whose foo attribute value is exactly equal to any (ASCII-range) case-permutation of bar + - `E[foo="bar" s]` – an E element whose foo attribute value is exactly and case-sensitively equal to bar + - `E[foo~="bar"]` – an E element whose foo attribute value is a list of whitespace-separated values, one of which is exactly equal to bar + - `E[foo^="bar"]` – an E element whose foo attribute value begins exactly with the string bar + - `E[foo$="bar"]` – an E element whose foo attribute value ends exactly with the string bar + - `E[foo*="bar"]` – an E element whose foo attribute value contains the substring bar + - `E[foo|="en"]` – an E element whose foo attribute value is a hyphen-separated list of values beginning with en + - `E F` – an F element descendant of an E element + - `E > F` – an F element child of an E element + - Supports multiple selectors delimited with a comma. + +#### `pretty` (optional) + + - When `false` or not included, JSON is minified. + - When `true`, formats the JSON using `JSON.stringify(json, null, 2)`. 
+ +#### `spaced` (optional) + + - When `false` or not included, the text nodes of children of the nodes matching `selector` are concatenated exactly as they are. + - When `true`, a single space character is inserted after each text node, so the text of adjacent children is separated. + +##### Examples + +Consider the following DOM structure: + +```<div><p>This is the first paragraph.</p><p>This is another paragraph.</p></div>``` + +If the `selector` is set to match `div`, by default the resulting text will be: + +```This is the first paragraph.This is another paragraph.``` + +This is because there is no space character between `</p>` and `<p>`. + +With `spaced` set to `true`, the result is: + +```This is the first paragraph. This is another paragraph.``` + +## Author + +Web Scraper was created by [Adam Schwartz](https://adamschwartz.co). diff --git a/html.js b/html.js new file mode 100644 index 0000000..e50df9b --- /dev/null +++ b/html.js @@ -0,0 +1,432 @@ +export default ` + + + + + + + Web Scraper · By Adam Schwartz · Powered by Cloudflare Workers® + + + + + + + + + + + + + + + + + + + + + + + + + + +

+
+ + +
+
+
+ + +
+ +
+ + +
+
+
+ +
+
+ + Permalink +
+
+
+ +
+
+
+
+ +
+
+ +
+
+
+
+
+`
\ No newline at end of file
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..2e16759
--- /dev/null
+++ b/index.js
@@ -0,0 +1,120 @@
+import html from './html.js'
+
+const contentTypes = {
+  html: 'text/html;charset=UTF-8',
+  json: 'application/json;charset=UTF-8'
+}
+
+// Trims the ends and collapses runs of two or more whitespace characters
+// into a single space.
+const cleanText = s => s.trim().replace(/\s\s+/g, ' ')
+
+// Query params are strings, so a flag is on only when it is non-empty and
+// not the literal 'false' (a bare truthiness check would accept 'false').
+const isEnabled = value => Boolean(value) && value.toLowerCase() !== 'false'
+
+addEventListener('fetch', event => {
+  event.respondWith(handleRequest(event.request))
+})
+
+// Turns the raw capture list of one selector into one cleaned string per
+// matched element. A `true` entry marks the start of a matched element;
+// every other entry is the text of one text node.
+function groupTexts(entries) {
+  const texts = []
+  let current = ''
+
+  entries.forEach(entry => {
+    if (entry === true) {
+      if (current.trim() !== '') {
+        texts.push(cleanText(current))
+        current = ''
+      }
+    } else {
+      current += entry
+    }
+  })
+
+  const last = cleanText(current)
+  if (last !== '') texts.push(last)
+  return texts
+}
+
+/**
+ * Handles one request: scrapes `url` for `selector` when both query params
+ * are present, otherwise serves the landing page at `/` or a 404.
+ *
+ * @param {Request} request Incoming request.
+ * @returns {Promise<Response>} JSON result, HTML landing page, or 404.
+ */
+async function handleRequest(request) {
+  const requestUrl = new URL(request.url)
+  const searchParams = requestUrl.searchParams
+
+  let url = searchParams.get('url')
+  if (url && !url.match(/^[a-zA-Z]+:\/\//)) url = 'http://' + url
+
+  const selector = searchParams.get('selector')
+  const pretty = isEnabled(searchParams.get('pretty'))
+  const spaced = isEnabled(searchParams.get('spaced')) // Adds spaces between tags
+
+  if (!url || !selector) {
+    if (requestUrl.pathname === '/' || requestUrl.pathname === '') {
+      return new Response(html, {
+        headers: { 'content-type': contentTypes.html }
+      })
+    }
+
+    // TODO - handle other paths?
+    return new Response('Not found', { status: 404 })
+  }
+
+  const response = await fetch(url)
+  const rewriter = new HTMLRewriter()
+
+  // Drop empty entries (e.g. a trailing comma) and duplicates so each
+  // selector gets exactly one handler and one result list.
+  const selectors = [
+    ...new Set(selector.split(',').map(s => s.trim()).filter(Boolean))
+  ]
+  const matches = {}
+
+  selectors.forEach(sel => {
+    matches[sel] = []
+    let nextText = ''
+
+    rewriter.on(sel, {
+      element() {
+        // Marker: a new matched element starts here.
+        matches[sel].push(true)
+        nextText = ''
+      },
+
+      text(text) {
+        // A text node may arrive in several chunks; buffer until the last.
+        nextText += text.text
+
+        if (text.lastInTextNode) {
+          if (spaced) nextText += ' '
+          matches[sel].push(nextText)
+          nextText = ''
+        }
+      }
+    })
+  })
+
+  // Consume the transformed body so the handlers run over the document.
+  await rewriter.transform(response).text()
+
+  selectors.forEach(sel => {
+    matches[sel] = groupTexts(matches[sel])
+  })
+
+  const json = {
+    result: selectors.length === 1 ? matches[selectors[0]] : matches
+  }
+
+  return new Response(JSON.stringify(json, null, pretty ? 2 : 0), {
+    headers: { 'content-type': contentTypes.json }
+  })
+}
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..963ec34
--- /dev/null
+++ b/package.json
@@ -0,0 +1,9 @@
+{
+  "private": true,
+  "name": "Workers Web Scraper",
+  "version": "1.0.0",
+  "description": "Web scraper built with Cloudflare Workers utilizing HTMLRewriter.",
+  "main": "index.js",
+  "author": "Adam Schwartz ",
+  "license": "MIT"
+}
diff --git a/wrangler.toml b/wrangler.toml
new file mode 100644
index 0000000..3b437c9
--- /dev/null
+++ b/wrangler.toml
@@ -0,0 +1,4 @@
+name = "web"
+account_id = "2575000dd90a08dbbeb2b8cca2e64775" # scraper.workers.dev
+type = "webpack"
+workers_dev = true