Store Technology meta data in BQ and icons in GCS (#73)
* dependency fix and update

* uploads

* roll back babel-eslint

* upload icons

* lint

* full schema

* fix name

* service account auth

* upload workflow

* Merge main into bq-upload

* icons upload tested

* lint

* typo
max-ostapenko authored Nov 18, 2024
1 parent dd1b4e9 commit 603be50
Showing 14 changed files with 1,178 additions and 258 deletions.
1 change: 0 additions & 1 deletion .eslintrc.js
@@ -10,7 +10,6 @@ module.exports = {
   extends: [
     '@nuxtjs',
     'prettier',
-    'prettier/vue',
     'plugin:prettier/recommended',
     'plugin:nuxt/recommended',
     'plugin:json/recommended',
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -34,7 +34,7 @@ jobs:
           WPT_SERVER: "webpagetest.httparchive.org"
           WPT_API_KEY: ${{ secrets.HA_API_KEY }}
           PR_NUMBER: ${{ github.event.pull_request.number }}
-        run: yarn test
+        run: yarn run test
 
       - name: Run WebPageTest for more websites
         id: wpt-test
45 changes: 45 additions & 0 deletions .github/workflows/upload.yml
@@ -0,0 +1,45 @@
name: Upload

on:
  push:
    branches:
      - main
    paths:
      - "src/technologies/*.json"
      - "src/categories.json"
      - "src/groups.json"
  workflow_dispatch:

jobs:
  test:
    name: Test and upload to GCP
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}
          fetch-depth: 0

      - name: Install dependencies
        run: yarn install

      - name: Validate
        run: yarn run validate

      - name: Run WebPageTest with unit tests
        id: unit-test
        env:
          WPT_SERVER: "webpagetest.httparchive.org"
          WPT_API_KEY: ${{ secrets.HA_API_KEY }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: yarn run test

      - name: Upload to GCP
        id: upload
        env:
          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
          GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}
        run: |
          echo "$GCP_SA_KEY" > /tmp/gcp_key.json
          yarn run upload
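Note: both upload scripts introduced below read credentials from the /tmp/gcp_key.json file this workflow writes. A minimal sketch of that shared assumption (client construction only; this snippet is illustrative, not part of the diff):

// The @google-cloud clients used by the upload scripts are constructed
// against the key file materialised from the GCP_SA_KEY secret above.
const { BigQuery } = require('@google-cloud/bigquery')
const { Storage } = require('@google-cloud/storage')

const keyFilename = '/tmp/gcp_key.json'
const bigquery = new BigQuery({ keyFilename })
const storage = new Storage({ keyFilename })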
13 changes: 8 additions & 5 deletions package.json
@@ -5,6 +5,8 @@
     "convert-svg-to-png": "^0.5.0"
   },
   "devDependencies": {
+    "@google-cloud/bigquery": "^7.7.0",
+    "@google-cloud/storage": "^7.11.0",
     "@nuxtjs/eslint-config": "^3.1.0",
     "@nuxtjs/eslint-module": "^2.0.0",
     "babel-eslint": "^10.1.0",
@@ -19,12 +21,13 @@
     "webpagetest": "github:HTTPArchive/WebPageTest.api-nodejs"
   },
   "scripts": {
-    "lint": "eslint src/**/*.{js,json} tests/**/*.js bin/**/*.js && jsonlint -jsV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -js --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
-    "lint:fix": "eslint --fix src/**/*.{js,json} tests/**/*.js bin/**/*.js && jsonlint -isV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -is --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
-    "validate": "yarn run lint && node ./bin/validate.js",
+    "lint": "eslint src/**/*.{js,json} tests/**/*.js scripts/**/*.js && jsonlint -jsV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -js --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
+    "lint:fix": "eslint --fix src/**/*.{js,json} tests/**/*.js scripts/**/*.js && jsonlint -isV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -is --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
+    "validate": "yarn run lint && node ./scripts/validate.js",
     "test": "jest",
-    "convert": "node --no-warnings ./bin/convert.js",
-    "build": "yarn run validate && yarn run convert && node ./bin/build.js"
+    "upload": "node ./scripts/upload_technology.js",
+    "convert": "node --no-warnings ./scripts/convert.js",
+    "build": "yarn run validate && yarn run convert && node ./scripts/build.js"
   },
   "jest": {
     "reporters": [
File renamed without changes.
File renamed without changes.
58 changes: 58 additions & 0 deletions scripts/upload_icons.js
@@ -0,0 +1,58 @@
/* eslint-disable no-console */
const fs = require('fs')
const path = require('path')
const { Storage } = require('@google-cloud/storage')

// Configuration
const BUCKET_NAME = 'technology_detections'
const ICONS_DIR = path.resolve(__dirname, '../src/images/icons/converted') // Local directory where the converted PNG icons are stored

const storage = new Storage({
  keyFilename: '/tmp/gcp_key.json',
})

async function syncIcons() {
  const bucket = storage.bucket(BUCKET_NAME)

  // Get the list of objects in the bucket, keyed by object name with their
  // last-updated timestamps
  const [filesInBucket] = await bucket.getFiles()
  const bucketFilesMap = new Map(
    filesInBucket.map((file) => [
      file.name,
      new Date(file.metadata.updated).getTime(),
    ])
  )

  // Read all PNG files from the local icons directory
  const localFiles = fs
    .readdirSync(ICONS_DIR)
    .filter((file) => file.endsWith('.png'))

  for (const file of localFiles) {
    const filePath = path.join(ICONS_DIR, file)
    const fileMetadata = fs.statSync(filePath)
    // Objects are uploaded under the icons/ prefix, so look them up by the
    // same key they were stored with
    const fileInBucketUpdatedTime = bucketFilesMap.get('icons/' + file)

    // Upload the file if it's new or has been updated locally
    if (
      !fileInBucketUpdatedTime ||
      fileMetadata.mtime.getTime() > fileInBucketUpdatedTime
    ) {
      try {
        await bucket.upload(filePath, {
          destination: 'icons/' + file,
          metadata: {
            contentType: 'image/png',
          },
        })
        console.log(`Uploaded: ${file}`)
      } catch (err) {
        console.error(`Error uploading file ${file}:`, err)
      }
    } else {
      console.log(`File already exists and is up to date: ${file}`)
    }
  }
}

syncIcons().catch(console.error)
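The freshness check above relies on each object's `updated` metadata. A quick way to spot-check it for a single uploaded icon (a minimal sketch; the object name is illustrative):

// Spot-check one uploaded icon (object name is illustrative).
// file.getMetadata() resolves with the same `updated` field that
// syncIcons() compares against the local file's mtime.
const { Storage } = require('@google-cloud/storage')

const storage = new Storage({ keyFilename: '/tmp/gcp_key.json' })

storage
  .bucket('technology_detections')
  .file('icons/Example.png')
  .getMetadata()
  .then(([metadata]) => console.log(metadata.name, metadata.updated))
  .catch(console.error)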
220 changes: 220 additions & 0 deletions scripts/upload_technology.js
@@ -0,0 +1,220 @@
/* eslint-disable no-console */
// A script to upload technologies and their categories to BigQuery.

const fs = require('fs')
const path = require('path')
const { BigQuery } = require('@google-cloud/bigquery')

// Read every JSON file in a directory and merge the parsed objects.
const readJsonFiles = (directory) => {
  const files = fs.readdirSync(directory)
  return files.reduce((mergedData, file) => {
    const filePath = path.join(directory, file)
    const data = fs.readFileSync(filePath, 'utf8')
    return { ...mergedData, ...JSON.parse(data) }
  }, {})
}

// Normalise a string-or-array value into an array.
const getArray = (value) =>
  typeof value === 'string' ? [value] : Array.isArray(value) ? value : []

// Normalise a detection rule into repeated { name, value } records.
const getRuleObject = (value) => {
  if (typeof value === 'string') {
    return [{ name: value, value: null }]
  }
  if (Array.isArray(value)) {
    return value.map((key) => ({ name: key, value: null }))
  }
  if (typeof value === 'object') {
    return Object.keys(value).map((key) => ({
      name: key,
      value:
        typeof value[key] === 'object'
          ? JSON.stringify(value[key])
          : value[key].toString(),
    }))
  }
  return []
}

const loadToBigQuery = async (
  data,
  tableName = 'apps',
  datasetName = 'wappalyzer',
  writeDisposition = 'WRITE_TRUNCATE',
  sourceFormat = 'NEWLINE_DELIMITED_JSON'
) => {
  if (!data) {
    throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`)
  }

  const bigquery = new BigQuery({
    keyFilename: '/tmp/gcp_key.json',
  })
  const schema = {
    fields: [
      { name: 'name', type: 'STRING' },
      { name: 'categories', type: 'STRING', mode: 'REPEATED' },
      { name: 'website', type: 'STRING' },
      { name: 'description', type: 'STRING' },
      { name: 'icon', type: 'STRING' },
      { name: 'cpe', type: 'STRING' },
      { name: 'saas', type: 'BOOLEAN' },
      { name: 'oss', type: 'BOOLEAN' },
      { name: 'pricing', type: 'STRING', mode: 'REPEATED' },
      { name: 'implies', type: 'STRING', mode: 'REPEATED' },
      { name: 'requires', type: 'STRING', mode: 'REPEATED' },
      { name: 'requiresCategory', type: 'STRING', mode: 'REPEATED' },
      { name: 'excludes', type: 'STRING', mode: 'REPEATED' },
      {
        name: 'cookies',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      {
        name: 'dom',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      {
        name: 'dns',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      {
        name: 'js',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      {
        name: 'headers',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      { name: 'text', type: 'STRING', mode: 'REPEATED' },
      { name: 'css', type: 'STRING', mode: 'REPEATED' },
      {
        name: 'probe',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      { name: 'robots', type: 'STRING', mode: 'REPEATED' },
      { name: 'url', type: 'STRING', mode: 'REPEATED' },
      { name: 'xhr', type: 'STRING', mode: 'REPEATED' },
      {
        name: 'meta',
        type: 'RECORD',
        mode: 'REPEATED',
        fields: [
          { name: 'name', type: 'STRING' },
          { name: 'value', type: 'STRING' },
        ],
      },
      { name: 'scriptSrc', type: 'STRING', mode: 'REPEATED' },
      { name: 'script', type: 'STRING', mode: 'REPEATED' },
      { name: 'html', type: 'STRING', mode: 'REPEATED' },
    ],
  }

  const options = { schema, sourceFormat, writeDisposition }
  const [job] = await bigquery
    .dataset(datasetName)
    .table(tableName)
    .load(data, options)

  if (job.status.errors && job.status.errors.length > 0) {
    console.error('Errors encountered:', job.status.errors)
    throw new Error('Error loading data into BigQuery')
  }

  console.log(
    `Loaded ${job.statistics.load.outputRows} rows into ${datasetName}.${tableName}...`
  )
}

const main = async () => {
  const technologies = readJsonFiles('./src/technologies')
  const categories = JSON.parse(
    fs.readFileSync('./src/categories.json', 'utf8')
  )

  const transformedTechnologies = Object.keys(technologies).map((key) => {
    const app = {
      name: key,
      categories: technologies[key].cats.map(
        (category) => categories[category].name
      ),
    }

    // String-or-array fields
    ;[
      'implies',
      'requires',
      'requiresCategory',
      'excludes',
      'text',
      'css',
      'robots',
      'url',
      'xhr',
      'scriptSrc',
      'script',
      'html',
    ].forEach((field) => {
      app[field] = getArray(technologies[key][field])
    })
    // Keyed detection rules become repeated { name, value } records
    ;['cookies', 'dom', 'dns', 'js', 'headers', 'probe', 'meta'].forEach(
      (field) => {
        app[field] = getRuleObject(technologies[key][field])
      }
    )
    // Scalar fields pass through unchanged
    ;[
      'website',
      'description',
      'icon',
      'cpe',
      'saas',
      'oss',
      'pricing',
    ].forEach((field) => {
      app[field] = technologies[key][field]
    })

    return app
  })

  const transformedTechnologiesJsonL = transformedTechnologies
    .map((line) => JSON.stringify(line))
    .join('\n')
  const filePath = './transformedTechnologies.jsonl'
  fs.writeFileSync(filePath, transformedTechnologiesJsonL)

  await loadToBigQuery(filePath, 'apps')

  // Clean up the temporary JSONL file
  fs.unlinkSync(filePath)
}

main().catch(console.error)
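To make the field mapping concrete, here is a minimal sketch of what one technology entry becomes: plain strings pass through `getArray`, while keyed detection rules like `js` become repeated `{ name, value }` records. The entry, name, and pattern are made up for this sketch, and the id-to-name lookup assumes `categories.json` maps id 12 to "JavaScript frameworks":

// Illustrative input entry, shaped like one key of src/technologies/*.json:
const exampleEntry = {
  cats: [12], // assumed to resolve to "JavaScript frameworks"
  website: 'https://example.com',
  implies: 'React',
  js: { __exampleVersion: '([\\d\\.]+)\\;version:\\1' },
}

// After the transforms above, the corresponding JSONL row (array fields
// default to [], absent scalar fields are omitted) is:
// {"name":"Example Tech",
//  "categories":["JavaScript frameworks"],
//  "implies":["React"],
//  "js":[{"name":"__exampleVersion","value":"([\\d\\.]+)\\;version:\\1"}],
//  "website":"https://example.com", ...}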
File renamed without changes.
2 changes: 1 addition & 1 deletion src/technologies/r.json
@@ -632,10 +632,10 @@
       "description": "React Router provides declarative routing for React.",
       "icon": "React Router.svg",
       "implies": "React",
-      "oss": true,
       "js": {
         "__reactRouterVersion": "([\\d\\.]+)\\;version:\\1"
       },
+      "oss": true,
       "website": "https://reactrouter.com"
     },
     "Reactive": {
2 changes: 1 addition & 1 deletion src/technologies/s.json
@@ -4132,8 +4132,8 @@
       "description": "Slider Revolution is a flexible and highly customisable slider.",
       "icon": "Slider Revolution.svg",
       "js": {
-        "SR7.version": "^Slider Revolution\\s([\\d\\.]+)$\\;version:\\1",
         "RS_MODULES.main.version": "^Slider Revolution\\s([\\d\\.]+)$\\;version:\\1",
+        "SR7.version": "^Slider Revolution\\s([\\d\\.]+)$\\;version:\\1",
         "revapi1": "",
         "revapi2": "",
         "revapi3": "",