forked from dochne/wappalyzer
-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Store Technology meta data in BQ and icons in GCS (#73)
* dependency fix and update * uploads * roll back babel-eslint * upload icons * lint * full schema * fix name * service account auth * upload workflow * Merge main into bq-upload * icons upload tested * lint * typo
- Loading branch information
1 parent
dd1b4e9
commit 603be50
Showing
14 changed files
with
1,178 additions
and
258 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# CI workflow: validates technology/category definitions and uploads the
# resulting metadata to GCP (BigQuery + GCS) whenever detection sources
# change on main, or on manual dispatch.
name: Tests

on:
  push:
    branches:
      - main
    # Only run when detection definitions change.
    paths:
      - "src/technologies/*.json"
      - "src/categories.json"
      - "src/groups.json"
  # Allow manual runs from the Actions tab.
  workflow_dispatch:

jobs:
  test:
    name: Test and upload to GCP
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # NOTE(review): this workflow triggers on push/workflow_dispatch,
          # where the `pull_request` context is empty, so this expression
          # resolves to '' and checkout falls back to the triggering commit.
          # Confirm whether this line (copied from a PR workflow?) is needed.
          ref: ${{ github.event.pull_request.head.sha }}
          # Full history; presumably required by the tests — verify, since
          # it slows down checkout.
          fetch-depth: 0

      - name: Install dependencies
        run: yarn install

      - name: Validate
        run: yarn run validate

      - name: Run WebPageTest with unit tests
        id: unit-test
        env:
          WPT_SERVER: "webpagetest.httparchive.org"
          WPT_API_KEY: ${{ secrets.HA_API_KEY }}
          # NOTE(review): empty on push events (no pull_request context).
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: yarn run test

      - name: Upload to GCP
        id: upload
        env:
          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
          GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}
        # Write the service-account key where the upload scripts expect it
        # (/tmp/gcp_key.json — see scripts' `keyFilename`), then upload.
        run: |
          echo $GCP_SA_KEY > /tmp/gcp_key.json
          yarn run upload
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/* eslint-disable no-console */
// Syncs locally converted PNG icons to a Google Cloud Storage bucket.
const fs = require('fs')
const path = require('path')
const { Storage } = require('@google-cloud/storage')

// Configuration
const BUCKET_NAME = 'technology_detections' // destination GCS bucket
const ICONS_DIR = path.resolve(__dirname, '../src/images/icons/converted') // Local directory where your PNG icons are stored

// GCS client. The service-account key is written to /tmp/gcp_key.json by
// the CI workflow before this script runs.
const storage = new Storage({
  keyFilename: '/tmp/gcp_key.json',
})
|
||
/**
 * Uploads local PNG icons to the GCS bucket, skipping files whose bucket
 * copy is newer than the local file's mtime.
 *
 * Objects are stored under the `icons/` prefix in the bucket.
 */
async function syncIcons() {
  const bucket = storage.bucket(BUCKET_NAME)

  // Map of existing bucket objects -> last-updated timestamp (ms since
  // epoch). Only list the `icons/` prefix we actually write to.
  const [filesInBucket] = await bucket.getFiles({ prefix: 'icons/' })
  const bucketFilesMap = new Map(
    filesInBucket.map((file) => [
      file.name,
      new Date(file.metadata.updated).getTime(),
    ])
  )

  // Read all files from the local icons directory
  const localFiles = fs
    .readdirSync(ICONS_DIR)
    .filter((file) => file.endsWith('.png'))

  for (const file of localFiles) {
    const filePath = path.join(ICONS_DIR, file)
    const fileMetadata = fs.statSync(filePath)
    // BUG FIX: objects live at `icons/<name>` in the bucket (see
    // `destination` below), but the lookup previously used the bare
    // filename, so it never matched and every icon was re-uploaded on
    // every run.
    const fileInBucketUpdatedTime = bucketFilesMap.get('icons/' + file)

    // Upload file if it's new or has been updated locally since the
    // bucket copy was written.
    if (
      !fileInBucketUpdatedTime ||
      fileMetadata.mtime.getTime() > fileInBucketUpdatedTime
    ) {
      try {
        await bucket.upload(filePath, {
          destination: 'icons/' + file,
          metadata: {
            contentType: 'image/png',
          },
        })
        console.log(`Uploaded: ${file}`)
      } catch (err) {
        // Best-effort: log and continue with the remaining icons.
        console.error(`Error uploading file ${file}:`, err)
      }
    } else {
      console.log(`File already exists and is up to date: ${file}`)
    }
  }
}

syncIcons().catch(console.error)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
/* eslint-disable no-console */ | ||
// A script to upload technologies and their categories to BigQuery. | ||
|
||
const fs = require('fs') | ||
const path = require('path') | ||
const { BigQuery } = require('@google-cloud/bigquery') | ||
|
||
/**
 * Reads every `.json` file in `directory` and merges their top-level keys
 * into a single object (later files win on key collisions).
 *
 * @param {string} directory - Directory containing technology JSON files.
 * @returns {object} Merged contents of all JSON files.
 */
const readJsonFiles = (directory) => {
  const merged = {}
  for (const file of fs.readdirSync(directory)) {
    // Skip non-JSON entries (editor/OS stray files) instead of letting
    // JSON.parse throw on them.
    if (!file.endsWith('.json')) continue
    const data = fs.readFileSync(path.join(directory, file), 'utf8')
    // Mutate a single accumulator: the original `{ ...merged, ...parsed }`
    // reduce re-copied every key on each file (accidental O(n²)).
    Object.assign(merged, JSON.parse(data))
  }
  return merged
}
|
||
// Normalizes a technology rule value to an array: a bare string becomes a
// one-element array, an array is passed through unchanged, and anything
// else (undefined, objects, numbers, ...) yields [].
const getArray = (value) => {
  if (typeof value === 'string') {
    return [value]
  }
  if (Array.isArray(value)) {
    return value
  }
  return []
}
|
||
/**
 * Normalizes a technology rule value to an array of {name, value} records
 * (the shape of the BigQuery RECORD columns).
 *
 * - string        -> [{ name: string, value: null }]
 * - array         -> one record per element, value null
 * - plain object  -> one record per key; object values are JSON-stringified,
 *                    primitives are stringified
 * - anything else -> []
 *
 * @param {*} value - Raw rule value from a technology definition.
 * @returns {Array<{name: string, value: ?string}>}
 */
const getRuleObject = (value) => {
  if (typeof value === 'string') {
    return [{ name: value, value: null }]
  }
  if (Array.isArray(value)) {
    return value.map((key) => ({ name: key, value: null }))
  }
  // Guard against null: `typeof null === 'object'`, so the original code
  // crashed here via Object.keys(null).
  if (value !== null && typeof value === 'object') {
    return Object.keys(value).map((key) => ({
      name: key,
      value:
        typeof value[key] === 'object'
          ? JSON.stringify(value[key]) // covers nested objects and null
          : String(value[key]), // String() also survives undefined, unlike .toString()
    }))
  }
  return []
}
|
||
/**
 * Loads a newline-delimited JSON file of technology rows into BigQuery.
 *
 * @param {string} data - Path to the JSONL file to load.
 * @param {string} [tableName='apps'] - Destination table.
 * @param {string} [datasetName='wappalyzer'] - Destination dataset.
 * @param {string} [writeDisposition='WRITE_TRUNCATE'] - Replaces the whole
 *   table on every run.
 * @param {string} [sourceFormat='NEWLINE_DELIMITED_JSON'] - Format of `data`.
 * @throws {Error} If no data path is given or the load job reports errors.
 */
const loadToBigQuery = async (
  data,
  tableName = 'apps',
  datasetName = 'wappalyzer',
  writeDisposition = 'WRITE_TRUNCATE',
  sourceFormat = 'NEWLINE_DELIMITED_JSON'
) => {
  if (!data) {
    throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`)
  }

  // Service-account key is written to /tmp/gcp_key.json by the CI workflow.
  const bigquery = new BigQuery({
    keyFilename: '/tmp/gcp_key.json',
  })
  // Explicit destination schema. Simple rules are REPEATED STRING columns;
  // rules that carry name/value pairs (cookies, dom, dns, js, headers,
  // probe, meta) are REPEATED RECORDs matching getRuleObject's output.
  const schema = {
    fields: [
      { name: 'name', type: 'STRING' },
      { name: 'categories', type: 'STRING', mode: 'REPEATED' },
      { name: 'website', type: 'STRING' },
      { name: 'description', type: 'STRING' },
      { name: 'icon', type: 'STRING' },
      { name: 'cpe', type: 'STRING' },
      { name: 'saas', type: 'BOOLEAN' },
      { name: 'oss', type: 'BOOLEAN' },
      { name: 'pricing', type: 'STRING', mode: 'REPEATED' },
      { name: 'implies', type: 'STRING', mode: 'REPEATED' },
      { name: 'requires', type: 'STRING', mode: 'REPEATED' },
      { name: 'requiresCategory', type: 'STRING', mode: 'REPEATED' },
      { name: 'excludes', type: 'STRING', mode: 'REPEATED' },
      {
        name: 'cookies',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      {
        name: 'dom',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      {
        name: 'dns',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      {
        name: 'js',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      {
        name: 'headers',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      { name: 'text', type: 'STRING', mode: 'REPEATED' },
      { name: 'css', type: 'STRING', mode: 'REPEATED' },
      {
        name: 'probe',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      { name: 'robots', type: 'STRING', mode: 'REPEATED' },
      { name: 'url', type: 'STRING', mode: 'REPEATED' },
      { name: 'xhr', type: 'STRING', mode: 'REPEATED' },
      {
        name: 'meta',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      { name: 'scriptSrc', type: 'STRING', mode: 'REPEATED' },
      { name: 'script', type: 'STRING', mode: 'REPEATED' },
      { name: 'html', type: 'STRING', mode: 'REPEATED' },
    ],
  }

  // Run the load job and wait for completion.
  const options = { schema, sourceFormat, writeDisposition }
  const [job] = await bigquery
    .dataset(datasetName)
    .table(tableName)
    .load(data, options)

  if (job.status.errors && job.status.errors.length > 0) {
    console.error('Errors encountered:', job.status.errors)
    throw new Error('Error loading data into BigQuery')
  }

  // NOTE(review): confirm `numRowsLoaded` exists on the returned job
  // metadata — BigQuery load jobs normally report row counts via
  // `statistics.load.outputRows`; this may log `undefined`.
  console.log(
    `Loaded ${job.numRowsLoaded} rows into ${datasetName}.${tableName}...`
  )
}
|
||
/**
 * Entry point: reads technology and category definitions, flattens each
 * technology into a BigQuery-ready row, writes the rows as JSONL, loads
 * the file into `wappalyzer.apps`, and removes the temp file.
 */
const main = async () => {
  const technologies = readJsonFiles('./src/technologies')
  const categories = JSON.parse(
    fs.readFileSync('./src/categories.json', 'utf8')
  )

  // Fields normalized to plain string arrays.
  const arrayFields = [
    'implies',
    'requires',
    'requiresCategory',
    'excludes',
    'text',
    'css',
    'robots',
    'url',
    'xhr',
    'scriptSrc',
    'script',
    'html',
  ]
  // Fields normalized to repeated {name, value} records.
  const recordFields = ['cookies', 'dom', 'dns', 'js', 'headers', 'probe', 'meta']
  // Fields copied through unchanged.
  const passthroughFields = [
    'website',
    'description',
    'icon',
    'cpe',
    'saas',
    'oss',
    'pricing',
  ]

  const transformedTechnologies = Object.entries(technologies).map(
    ([name, tech]) => {
      const row = {
        name,
        // Map numeric category ids to their human-readable names.
        categories: tech.cats.map((id) => categories[id].name),
      }

      for (const field of arrayFields) {
        row[field] = getArray(tech[field])
      }
      for (const field of recordFields) {
        row[field] = getRuleObject(tech[field])
      }
      for (const field of passthroughFields) {
        row[field] = tech[field]
      }

      return row
    }
  )

  // Serialize as newline-delimited JSON — the format BigQuery expects.
  const filePath = './transformedTechnologies.jsonl'
  fs.writeFileSync(
    filePath,
    transformedTechnologies.map((row) => JSON.stringify(row)).join('\n')
  )

  await loadToBigQuery(filePath, 'apps')

  // cleanup file
  fs.unlinkSync(filePath)
}

main().catch(console.error)
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.