Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f78096c
refactor(logging): remove winston-azure-application-insights, update …
elaine-mattos Jul 17, 2025
d9a1c4d
chore: add env.json to gitignore
elaine-mattos Jul 17, 2025
efaf0b2
feat: upgrade to node 20
elaine-mattos Jul 17, 2025
c2a815a
feat: improve reuse version detection
elaine-mattos Jul 17, 2025
3b062e2
fix(scancode): improve ScanCode version detection and error logging
elaine-mattos Jul 17, 2025
707e478
fix: update scancode vr on test fixtures
elaine-mattos Jul 17, 2025
498d025
refactor(processors): improve logging and file path handling
elaine-mattos Jul 23, 2025
f9e67b2
build(docker): update Scancode to 32.3.3 and REUSE to 5.0.2
elaine-mattos Jul 23, 2025
e1d1f60
refactor(config): split file store location into definition and attac…
elaine-mattos Jul 23, 2025
f8c3321
fix: adjust release date assertion to handle UTC timestamps
elaine-mattos Jul 23, 2025
28ee3fa
fix: update to node 24
elaine-mattos Jul 23, 2025
7e02f85
fix: add Dockerfile and package.json
elaine-mattos Jul 23, 2025
e51d106
fix: add formatting with prettier
elaine-mattos Jul 24, 2025
0591f34
feat: test k8s implementation
elaine-mattos Aug 21, 2025
cbedf91
test: add more log lines
elaine-mattos Sep 8, 2025
ba6ee3e
fix: add defensive logging to try and catch object corruption in k8s
elaine-mattos Sep 8, 2025
a8fa22b
test: try and return to old stands
elaine-mattos Sep 8, 2025
a7aaf4c
fix: add proxy agent to axios
elaine-mattos Sep 9, 2025
9f475d7
fix: use named import
elaine-mattos Sep 9, 2025
54baaf9
fix: use corporate registry
elaine-mattos Sep 9, 2025
1636389
test: try and catch the bad requests
elaine-mattos Sep 10, 2025
c959add
fix: remove old test code
elaine-mattos Sep 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:

- uses: actions/setup-node@v4
with:
node-version: 18
node-version: 24
cache: 'npm'

- name: Install dependencies
Expand Down
26 changes: 24 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,30 @@

# Dependency directories
node_modules/


# Log files
npm-debug.log


# Coverage output (used by tools like Istanbul/nyc)
coverage/
.DS_Store
.npmrc
.nyc_output


# System files
.DS_Store


# Editor and OS backup files
*~
\#*#


# Configuration files
.npmrc


# Environment variable files
.env
env.json
8 changes: 4 additions & 4 deletions DevDockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Copyright (c) Microsoft Corporation and others. Licensed under the MIT license.
# SPDX-License-Identifier: MIT

FROM node:18-bullseye
FROM node:24-bullseye
ENV APPDIR=/opt/service

# Set environment variables from build arguments
ARG BUILD_NUMBER=0
ENV BUILD_NUMBER=$APP_VERSION
ARG APP_VERSION="UNKNOWN"
ENV BUILD_NUMBER=$APP_VERSION
ENV APP_VERSION=$APP_VERSION
ARG BUILD_SHA="UNKNOWN"
ENV BUILD_SHA=$BUILD_SHA
Expand Down Expand Up @@ -54,11 +54,11 @@ RUN mkdir -p "${APPDIR}" && cp -a /tmp/node_modules "${APPDIR}"
WORKDIR "${APPDIR}"
COPY . "${APPDIR}"

ENV NODE_ENV "localhost"
ENV NODE_ENV="localhost"

# Uncomment this if you want to see debug output
#ENV DEBUG=*

ENV PORT 5000
ENV PORT=5000
EXPOSE 5000
ENTRYPOINT ["node", "index.js"]
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation and others. Licensed under the MIT license.
# SPDX-License-Identifier: MIT

FROM node:18-bullseye
FROM node:24-bullseye
ENV APPDIR=/opt/service

# Set environment variables from build arguments
Expand All @@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --no-install-su
gem install bundler -v 2.5.4 --no-document

# Scancode
ARG SCANCODE_VERSION="32.1.0"
ARG SCANCODE_VERSION="32.3.3"
RUN pip3 install --upgrade pip setuptools wheel && \
curl -Os https://raw.githubusercontent.com/nexB/scancode-toolkit/v$SCANCODE_VERSION/requirements.txt && \
pip3 install --constraint requirements.txt scancode-toolkit==$SCANCODE_VERSION && \
Expand All @@ -40,7 +40,7 @@ RUN gem install nokogiri:1.16.0 --no-document && \

# REUSE
RUN pip3 install setuptools
RUN pip3 install reuse==3.0.1
RUN pip3 install reuse==5.0.2

# Crawler config
ENV CRAWLER_DEADLETTER_PROVIDER=cd(azblob)
Expand All @@ -62,6 +62,6 @@ RUN mkdir -p "${APPDIR}" && cp -a /tmp/node_modules "${APPDIR}"
WORKDIR "${APPDIR}"
COPY . "${APPDIR}"

ENV PORT 5000
ENV PORT=5000
EXPOSE 5000
ENTRYPOINT ["node", "index.js"]
14 changes: 14 additions & 0 deletions ghcrawler/lib/crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,20 @@ class Crawler {
// else release
// if release fails abandon as everyone will think it is still in the queue
// else delete

// TODO ELAINE: remove this commented code later
// if (!request.type || !request.url) {
// console.log(`[ERROR] Request object corrupted: type=${request.type}, url=${request.url}`)
// console.log(`[ERROR] Request keys:`, Object.keys(request).slice(0, 10))
// request = Request.adopt(request)
// }

if (request && typeof request.toUniqueString !== 'function') {
console.log(
`[DEBUG] Calling adopt in _completeRequest for ${request ? request.type : 'undefined'}@${request ? request.url : 'undefined'}`
)
request = Request.adopt(request)
}
const loopName = request.meta ? request.meta.loopName : ''
debug(`_completeRequest(${loopName}:${request.toUniqueString()}): enter (force: ${forceRequeue})`)
const self = this
Expand Down
20 changes: 20 additions & 0 deletions ghcrawler/lib/request.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,26 @@ class Request {
return object
}

// static adopt(object) {
// console.log(`[DEBUG] Request.adopt called for ${object ? object.type : 'undefined'}@${object ? object.url : 'undefined'}`);
// console.log(`[DEBUG] Before adoption: hasProto=${!!object}, isRequestProto=${object && object.__proto__ === Request.prototype}`);

// if (object && object.__proto__ !== Request.prototype) {
// console.log(`[DEBUG] Restoring prototype chain to Request`);
// object.__proto__ = Request.prototype;
// }

// if (object && object.policy) {
// object.policy = Request._getResolvedPolicy(object);
// Policy.adopt(object.policy);
// } else if (object && object.type) {
// object.policy = Policy.default(object.type);
// }

// console.log(`[DEBUG] After adoption: hasToUniqueString=${object && typeof object.toUniqueString === 'function'}`);

// return object;
// }
static _getResolvedPolicy(request) {
let policyOrSpec = request.policy
if (typeof policyOrSpec !== 'string') {
Expand Down
12 changes: 11 additions & 1 deletion lib/fetch.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@
// SPDX-License-Identifier: MIT

const axios = require('axios')
const { HttpsProxyAgent } = require('https-proxy-agent')

const defaultHeaders = Object.freeze({ 'User-Agent': 'clearlydefined.io crawler ([email protected])' })

axios.defaults.headers = defaultHeaders

const httpsProxy = process.env.HTTPS_PROXY || process.env.https_proxy
const httpsAgent = httpsProxy ? new HttpsProxyAgent(httpsProxy) : undefined

function buildRequestOptions(request) {
let responseType = 'text'
if (request.json) {
Expand All @@ -26,6 +30,8 @@ function buildRequestOptions(request) {
responseType,
headers: request.headers,
data: request.body,
httpsAgent,
proxy: false, // disable axios' built-in proxy handling so that httpsAgent (when set) performs the proxying
...validateOptions
}
}
Expand All @@ -45,7 +51,11 @@ async function callFetch(request, axiosInstance = axios) {
}

/**
 * Builds a request function bound to a dedicated axios instance.
 *
 * NOTE(review): this span appears to be a rendered diff hunk, so it shows two
 * `axiosInstance` declarations — presumably the earlier `axios.create(opts)`
 * call followed by its replacement. Confirm against the actual file, where
 * only one of the two can exist.
 *
 * @param opts - options spread into `axios.create`.
 * @returns a function that takes a request and forwards it to `callFetch`
 *          together with the created axios instance.
 */
function withDefaults(opts) {
const axiosInstance = axios.create(opts)
// Same caller options, with the module-level httpsAgent and `proxy: false` added on top.
const axiosInstance = axios.create({
...opts,
httpsAgent,
proxy: false
})
return request => callFetch(request, axiosInstance)
}

Expand Down
Loading