From fc1d0a431f9278768d39f5d72b72c6f1da760270 Mon Sep 17 00:00:00 2001
From: Jono Yang
Date: Wed, 6 Sep 2023 16:30:35 -0700
Subject: [PATCH 01/19] Create functions to determine Maven URLs #179

Signed-off-by: Jono Yang
---
 minecode/visitors/maven.py | 155 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 154 insertions(+), 1 deletion(-)

diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py
index 7fd70ac7..12ab6973 100644
--- a/minecode/visitors/maven.py
+++ b/minecode/visitors/maven.py
@@ -13,7 +13,9 @@
 import io
 import json
 import logging
+import re
 from typing import Dict
+from urllib.parse import urlparse
 
 import arrow
 import requests
@@ -305,7 +307,7 @@ def map_maven_package(package_url, package_content):
         ancestor_pom_texts=ancestor_pom_texts,
         package=package
     )
-    
+
     urls = get_urls(
         namespace=package_url.namespace,
@@ -453,6 +455,157 @@ def process_request(purl_str):
     return error
 
 
+collect_links = re.compile('href="([^"]+)"').findall
+
+
+def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
+    """
+    Return True if `file_name` is in `links`
+    """
+    return any(l.endswith(file_name) for l in links)
+
+
+def check_if_page_has_pom_files(links, **kwargs):
+    """
+    Return True if any entry in `links` ends with .pom.
+    """
+    return any(l.endswith('.pom') for l in links)
+
+
+def check_if_page_has_directories(links, **kwargs):
+    """
+    Return True if any entry, excluding "../", ends with /.
+    """
+    return any(l.endswith('/') for l in links if l != '../')
+
+
+def check_if_package_version_page(links, **kwargs):
+    """
+    Return True if `links` contains pom files and has no directories
+    """
+    return (
+        check_if_page_has_pom_files(links=links)
+        and not check_if_page_has_directories(links=links)
+    )
+
+
+def check_if_package_page(links, **kwargs):
+    return (
+        check_if_file_name_is_linked_on_page(file_name='maven-metadata.xml', links=links)
+        and not check_if_page_has_pom_files(links=links)
+    )
+
+
+def check_if_maven_root(links, **kwargs):
+    """
+    Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven
+    repo contains "archetype-catalog.xml".
+    """
+    return check_if_file_name_is_linked_on_page(file_name='archetype-catalog.xml', links=links)
+
+
+def check_on_page(url, checker):
+    """
+    Return True if the links collected from the page at `url` satisfy the
+    `checker` function, False otherwise.
+    """
+    response = requests.get(url)
+    if response:
+        links = collect_links(response.text)
+        return checker(links=links)
+    return False
+
+
+def is_maven_root(url):
+    """
+    Return True if `url` is the root of a Maven repo, False otherwise.
+    """
+    return check_on_page(url, check_if_maven_root)
+
+
+def is_package_page(url):
+    """
+    Return True if `url` is a package page on a Maven repo, False otherwise.
+    """
+    return check_on_page(url, check_if_package_page)
+
+
+def is_package_version_page(url):
+    """
+    Return True if `url` is a package version page on a Maven repo, False otherwise.
+    """
+    return check_on_page(url, check_if_package_version_page)
+
+
+def url_parts(url):
+    parsed_url = urlparse(url)
+    scheme = parsed_url.scheme
+    netloc = parsed_url.netloc
+    path_segments = [p for p in parsed_url.path.split('/') if p]
+    return scheme, netloc, path_segments
+
+
+def create_url(scheme, netloc, path_segments):
+    url_template = f'{scheme}://{netloc}'
+    path = '/'.join(path_segments)
+    return f'{url_template}/{path}'
+
+
+def get_maven_root(url):
+    """
+    Given `url`, that is a URL to namespace, package, or artifact in a Maven
+    repo, return the URL to the root of that repo.
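+    The root is located by probing successively longer path prefixes of `url`
+    with is_maven_root() until one responds like a repository root.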
+    If a Maven root cannot be determined, return None.
+
+    >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+    'https://repo1.maven.org/maven2'
+    """
+    scheme, netloc, path_segments = url_parts(url)
+    for i in range(len(path_segments)):
+        segments = path_segments[:i+1]
+        url_segment = create_url(scheme, netloc, segments)
+        if is_maven_root(url_segment):
+            return url_segment
+    return None
+
+
+def determine_namespace_name_version_from_url(url):
+    """
+    Return a 3-tuple containing strings of a Package namespace, name, and
+    version, determined from `url`, where `url` points to namespace, package,
+    specific package version, or artifact on a Maven repo.
+
+    Raise an Exception if a Maven root cannot be determined from `url`.
+
+    >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+    ('net.shibboleth', 'parent', '7.11.0')
+    """
+    root_url = get_maven_root(url)
+    if not root_url:
+        raise Exception(f'Error: not a Maven repository: {url}')
+
+    _, remaining_path_segments = url.split(root_url)
+    remaining_path_segments = remaining_path_segments.split('/')
+    remaining_path_segments = [p for p in remaining_path_segments if p]
+
+    namespace_segments = []
+    package_name = ''
+    package_version = ''
+    for i in range(len(remaining_path_segments)):
+        segment = remaining_path_segments[i]
+        segments = remaining_path_segments[:i+1]
+        path = '/'.join(segments)
+        url_segment = f'{root_url}/{path}'
+        if is_package_page(url_segment):
+            package_name = segment
+        elif is_package_version_page(url_segment):
+            package_version = segment
+        else:
+            namespace_segments.append(segment)
+    namespace = '.'.join(namespace_segments)
+    return namespace, package_name, package_version
+
+
 @visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
 @visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
 class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor):

From 245be34233bae651957f0f5f2bd640aca3f121e4 Mon Sep 17 00:00:00 2001
From: Jono Yang
Date: Mon, 11 Sep 2023 18:12:42 -0700
Subject: [PATCH 02/19] Implement ImportableURI queue #179

Signed-off-by: Jono Yang
---
 minecode/management/commands/import_queue.py | 145 +++++++++++++++
 minecode/migrations/0031_importableuri.py    | 181 +++++++++++++++++++
 minecode/models.py                           | 134 +++++++++++++-
 minecode/visitors/maven.py                   | 160 +++++++++++++++-
 4 files changed, 615 insertions(+), 5 deletions(-)
 create mode 100644 minecode/management/commands/import_queue.py
 create mode 100644 minecode/migrations/0031_importableuri.py

diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py
new file mode 100644
index 00000000..3d14e0b8
--- /dev/null
+++ b/minecode/management/commands/import_queue.py
@@ -0,0 +1,145 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+# + +import logging +import signal +import sys +import time + +import requests + +from django.db import transaction +from django.utils import timezone +from packageurl import PackageURL + +from minecode.management.commands import get_error_message +from minecode.management.commands import VerboseCommand +from minecode.models import ImportableURI +from minecode.visitors.maven import get_artifact_links +from minecode.visitors.maven import get_classifier_from_artifact_url +from minecode.visitors.maven import collect_links_from_text +from minecode.visitors.maven import filter_only_directories +from minecode.visitors.maven import get_artifact_sha1 +from packagedb.models import Package + + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout) +logger.setLevel(logging.INFO) + +TRACE = False +if TRACE: + logger.setLevel(logging.DEBUG) + +# sleep duration in seconds when the queue is empty +SLEEP_WHEN_EMPTY = 10 + +MUST_STOP = False + + +def stop_handler(*args, **kwargs): + """ + Signal handler to set global variable to True. + """ + global MUST_STOP + MUST_STOP = True + + +signal.signal(signal.SIGTERM, stop_handler) + + +class Command(VerboseCommand): + help = 'Run a Package request queue.' + + def handle(self, *args, **options): + """ + Get the next processable PriorityResourceURI and start the + processing. Loops forever and sleeps a short while if there are + no PriorityResourceURI left to process. + """ + + global MUST_STOP + + sleeping = False + processed_counter = 0 + + while True: + if MUST_STOP: + logger.info('Graceful exit of the request queue.') + break + + with transaction.atomic(): + importable_uri = ImportableURI.objects.get_next_request() + + if not importable_uri: + # Only log a single message when we go to sleep + if not sleeping: + sleeping = True + logger.info('No more processable request, sleeping...') + + time.sleep(SLEEP_WHEN_EMPTY) + continue + + sleeping = False + + # process request + logger.info('Processing {}'.format(importable_uri)) + try: + errors = process_request(importable_uri) + except Exception as e: + errors = 'Error: Failed to process ImportableURI: {}\n'.format( + repr(importable_uri)) + errors += get_error_message(e) + finally: + if errors: + importable_uri.processing_error = errors + logger.error(errors) + importable_uri.processed_date = timezone.now() + importable_uri.wip_date = None + importable_uri.save() + processed_counter += 1 + + return processed_counter + + +def process_request(importable_uri): + uri = importable_uri.uri + uri = uri.rstrip('/') + data = importable_uri.data + if not data: + # collect data again if we don't have it + response = requests.get(uri) + if response: + data = requests.text + + package_url = PackageURL.from_string(importable_uri.package_url) + namespace = package_url.namespace + name = package_url.name + + # Go into each version directory + for link in collect_links_from_text(data, filter_only_directories): + version = link.rstrip('/') + version_page_url = f'{uri}/{version}' + for artifact_link in get_artifact_links(version_page_url): + sha1 = get_artifact_sha1(artifact_link) + classifier = get_classifier_from_artifact_url(artifact_link) + qualifiers = None + if classifier: + qualifiers = f'classifier={classifier}' + package = Package.objects.create( + type='maven', + namespace=namespace, + name=name, + version=version, + qualifiers=qualifiers, + download_url=artifact_link, + sha1=sha1, + ) + if package: + logger.info('Created package {package}') diff --git a/minecode/migrations/0031_importableuri.py 
b/minecode/migrations/0031_importableuri.py new file mode 100644 index 00000000..0d557312 --- /dev/null +++ b/minecode/migrations/0031_importableuri.py @@ -0,0 +1,181 @@ +# Generated by Django 4.1.2 on 2023-09-12 00:14 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("minecode", "0030_scannableuri_rescan_alter_scannableuri_scan_status"), + ] + + operations = [ + migrations.CreateModel( + name="ImportableURI", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "uri", + models.CharField( + db_index=True, + help_text="URI for this resource. This is the unmodified original URI.", + max_length=2048, + ), + ), + ( + "canonical", + models.CharField( + db_index=True, + help_text="Canonical form of the URI for this resource that must be unique across all ResourceURI.", + max_length=3000, + ), + ), + ( + "source_uri", + models.CharField( + blank=True, + help_text="Optional: real source remote URI for this visit.For example for a package repository index is a typical source via which a first level of package data is fetched. And it is not the URI in the uri field. It is just the source of the fetchOr the source may be a mirror URI used for fetching.", + max_length=2048, + null=True, + ), + ), + ( + "priority", + models.PositiveIntegerField( + db_index=True, + default=0, + help_text="Absolute procdssing priority of a URI (default to zero), higher number means higher priority, zero means lowest priority.", + ), + ), + ( + "wip_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Work In Progress. This is a timestamp set at the start of a visit or mapping or indexing or null when no processing is in progress.", + null=True, + ), + ), + ( + "file_name", + models.CharField( + blank=True, + db_index=True, + help_text="File name of a resource sometimes part of the URI proper and sometimes only available through an HTTP header.", + max_length=255, + null=True, + ), + ), + ( + "size", + models.PositiveIntegerField( + blank=True, + db_index=True, + help_text="Size in bytes of the file represented by this ResourceURI.", + null=True, + ), + ), + ( + "sha1", + models.CharField( + blank=True, + db_index=True, + help_text="SHA1 checksum hex-encoded (as in the sha1sum command) of the content of the file represented by this ResourceURI.", + max_length=40, + null=True, + ), + ), + ( + "md5", + models.CharField( + blank=True, + db_index=True, + help_text="MD5 checksum hex-encoded (as in the md5sum command) of the content of the file represented by this ResourceURI.", + max_length=32, + null=True, + ), + ), + ( + "sha256", + models.CharField( + blank=True, + db_index=True, + help_text="SHA256 checksum hex-encoded (as in the sha256sum command) of the content of the file represented by this ResourceURI.", + max_length=64, + null=True, + ), + ), + ( + "last_modified_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Timestamp set to the last modified date of the remote resource represented by this URI such as the modified date of a file, the lastmod value on a sitemap or the modified date returned by an HTTP resource.", + null=True, + ), + ), + ( + "package_url", + models.CharField( + blank=True, + db_index=True, + help_text='Package URL for this resource. 
It stands for a package "mostly universal" URL.', + max_length=2048, + null=True, + ), + ), + ( + "data", + models.TextField( + blank=True, + help_text="Text content of the file represented by this ResourceURI. This contains the data that was fetched or extracted from a remote ResourceURI such as HTML or JSON.", + null=True, + ), + ), + ( + "request_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Timestamp set to the date of when this Package info was requested.", + null=True, + ), + ), + ( + "processed_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Timestamp set to the date of when this Package info was processed.", + null=True, + ), + ), + ( + "has_processing_error", + models.BooleanField( + db_index=True, + default=False, + help_text="When set to True (Yes), this field indicates that an error has occured when processing this URI.", + ), + ), + ( + "processing_error", + models.TextField( + blank=True, + help_text="Processing errors messages. When present this means the processing failed.", + null=True, + ), + ), + ], + options={ + "verbose_name": "Importable URI", + }, + ), + ] diff --git a/minecode/models.py b/minecode/models.py index 16c44854..3efcb040 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -31,10 +31,6 @@ logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) -# logger = logging.getLogger(__name__) -# handler = logging.StreamHandler() -# logger.addHandler(handler) - def get_canonical(uri): """ @@ -936,3 +932,133 @@ def save(self, *args, **kwargs): """ self.normalize_fields() super(PriorityResourceURI, self).save(*args, **kwargs) + + +# TODO: Use the QuerySet.as_manager() for more flexibility and chaining. +class ImportableURIManager(models.Manager): + def insert(self, uri, data, **extra_fields): + """ + Create and return a new ImportableURI + Return None if the insertion failed when the same URI exists with the same versions to be collected + """ + # TODO: be able to create a request for an existing purl if the previous request has been completed already + + importable_uris = self.filter( + uri=uri, + **extra_fields + ) + if ( + importable_uris.count() == 0 + or all(p.processed_date for p in importable_uris) + ): + importable_uri = self.create( + uri=uri, + data=data, + **extra_fields + ) + return importable_uri + + def in_progress(self): + """ + Limit the QuerySet to ImportableURI being processed. + """ + return self.filter(wip_date__isnull=False) + + def never_processed(self): + """ + Limit the QuerySet to ImportableURIs that have never been processed. + This is usually the state of a ImportableURI after upon creation. + """ + return self.filter( + processed_date__isnull=True, + wip_date__isnull=True + ).order_by( + 'request_date' + ) + + def get_requests(self): + """ + Return an ordered query set of all processable ImportableURIs. + """ + never_processed = self.never_processed() + return never_processed + + def get_next_request(self): + """ + Return the next ImportableURI request for processing and mark it + as being "in_progress" by setting the wip_date field. + + Return None when there is no request left to visit. + + NOTE: this method can only be called from within a transaction.atomic + block. 
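+
+        A minimal usage sketch (hypothetical caller, assuming an open
+        transaction), mirroring how the import_queue command calls it:
+
+            with transaction.atomic():
+                importable_uri = ImportableURI.objects.get_next_request()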
+ """ + importable_uri = self.get_requests().select_for_update(skip_locked=True).first() + if not importable_uri: + return + importable_uri.wip_date = timezone.now() + importable_uri.save(update_fields=['wip_date']) + return importable_uri + + +# TODO: have a second queue for crawling maven repo, that tracks which pages and namespaces we visited +# when we hit the point of a package page, we add it to the queue that creates skinny packages for the package we visited. + +class ImportableURI(BaseURI): + package_url = models.CharField( + max_length=2048, + null=True, + blank=True, + db_index=True, + help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""" + ) + + # This is a text blob that contains either HTML, JSON or anything + # stored as a string. This is the raw content of visiting a URI. + # NOTE: some visited URLS (such as an actual package archive will/shoud NOT be stored there) + data = models.TextField( + null=True, + blank=True, + help_text='Text content of the file represented by this ' + 'ResourceURI. This contains the data that was fetched or ' + 'extracted from a remote ResourceURI such as HTML or JSON.', + ) + + request_date = models.DateTimeField( + null=True, + blank=True, + db_index=True, + help_text='Timestamp set to the date of when this Package info was requested.', + ) + + processed_date = models.DateTimeField( + null=True, + blank=True, + db_index=True, + help_text='Timestamp set to the date of when this Package info was processed.', + ) + + has_processing_error = models.BooleanField( + db_index=True, + default=False, + help_text='When set to True (Yes), this field indicates that ' + 'an error has occured when processing this URI.' + ) + + processing_error = models.TextField( + null=True, + blank=True, + help_text='Processing errors messages. When present this means the processing failed.', + ) + + objects = ImportableURIManager() + + class Meta: + verbose_name = 'Importable URI' + + def save(self, *args, **kwargs): + """ + Save, adding defaults for computed fields and validating fields. + """ + self.normalize_fields() + super(ImportableURI, self).save(*args, **kwargs) diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 12ab6973..57bc0c19 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -569,7 +569,7 @@ def get_maven_root(url): return None -def determine_namespace_name_version_from_url(url): +def determine_namespace_name_version_from_url(url, root_url): """ Return a 3-tuple containing strings of a Package namespace, name, and version, determined from `url`, where `url` points to namespace, package, @@ -606,6 +606,164 @@ def determine_namespace_name_version_from_url(url): return namespace, package_name, package_version +def add_to_import_queue(url): + """ + Create ImportableURI for the Maven repo package page at `url`. 
+ """ + from minecode.models import ImportableURI + data = None + response = requests.get(url) + if response: + data = response.text + importable_uri = ImportableURI.objects.insert(url, data) + if importable_uri: + logger.info(f'Inserted {url} into ImportableURI queue') + + +def filter_only_directories(links): + return [l for l in links if l != '../' and l.endswith('/')] + + +valid_artifact_extensions = [ + 'ejb3', + 'ear', + 'aar', + 'apk', + 'gem', + 'jar', + 'nar', + # 'pom', + 'so', + 'swc', + 'tar', + 'tar.gz', + 'war', + 'xar', + 'zip', +] + + +def filter_for_artifacts(links): + artifacts = [] + for l in links: + for ext in valid_artifact_extensions: + if l.endswith(ext): + artifacts.append(l) + return artifacts + + +def collect_links_from_text(text, filter): + """ + Return a list of link locations, given HTML `text` content, that is filtered + using `filter`. + """ + links = collect_links(text) + links = filter(links=links) + return links + + +def create_absolute_urls_for_links(text, url, filter): + """ + Given the `text` contents from `url`, return a list of absolute URLs to + links from `url` that are filtered by `checker`. + """ + absolute_links = [] + url = url.rstrip('/') + for link in collect_links_from_text(text, filter): + if not link.startswith(url): + link = f'{url}/{link}' + absolute_links.append(link) + return absolute_links + + +def get_directory_links(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + directory_links = [] + response = requests.get(url) + if response: + directory_links = create_absolute_urls_for_links( + response.text, + url=url, + filter=filter_only_directories + ) + return directory_links + + +def get_artifact_links(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + directory_links = [] + response = requests.get(url) + if response: + directory_links = create_absolute_urls_for_links( + response.text, + url=url, + filter=filter_for_artifacts + ) + return directory_links + + +def crawl_to_package(url): + """ + Given a maven repo `url`, + """ + if is_package_page(url): + add_to_import_queue(url) + return + + for link in get_directory_links(url): + crawl_to_package(link) + + +def crawl_maven_repo_from_root(root_url): + """ + Given the `url` to a maven root, traverse the repo depth-first and add + packages to the import queue. + """ + crawl_to_package(root_url) + + +def get_artifact_sha1(artifact_url): + """ + Return the SHA1 value of the Maven artifact located at `artifact_url`. 
+ """ + sha1 = None + artifact_sha1_url = f'{artifact_url}.sha1' + response = requests.get(artifact_sha1_url) + if response: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + return sha1 + + +def get_classifier_from_artifact_url(artifact_url, package_version_page_url, package_name, package_version): + """ + Return the classifier from a Maven artifact URL `artifact_url`, otherwise + return None if a classifier cannot be determined from `artifact_url` + """ + classifier = None + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 + package_version_page_url = package_version_page_url.rstrip('/') + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 + leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}' + # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' + # ['', '-onejar.jar'] + _, remaining_url_portion = artifact_url.split(leading_url_portion) + # ['-onejar', 'jar'] + remaining_url_portions = remaining_url_portion.split('.') + if remaining_url_portions: + # '-onejar' + classifier = remaining_url_portion[0] + if classifier.startswith('-'): + # 'onejar' + classifier = classifier[1:] + return classifier + + @visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') @visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor): From d62332a960aee92d38dcb07dce509ebd79ec76b9 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 12 Sep 2023 14:38:20 -0700 Subject: [PATCH 03/19] Properly get classifier from url #179 Signed-off-by: Jono Yang --- minecode/management/commands/import_queue.py | 26 ++++++++++++++------ minecode/visitors/maven.py | 4 +-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 3d14e0b8..68dcd29f 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -26,7 +26,10 @@ from minecode.visitors.maven import collect_links_from_text from minecode.visitors.maven import filter_only_directories from minecode.visitors.maven import get_artifact_sha1 +from minecode.model_utils import merge_or_create_package +from packagedcode.models import PackageData from packagedb.models import Package +from minecode.visitors.maven import determine_namespace_name_version_from_url logger = logging.getLogger(__name__) @@ -118,9 +121,8 @@ def process_request(importable_uri): if response: data = requests.text - package_url = PackageURL.from_string(importable_uri.package_url) - namespace = package_url.namespace - name = package_url.name + # TODO: determine namespace from initial traversal + namespace, name, _ = determine_namespace_name_version_from_url(uri) # Go into each version directory for link in collect_links_from_text(data, filter_only_directories): @@ -128,11 +130,11 @@ def process_request(importable_uri): version_page_url = f'{uri}/{version}' for artifact_link in get_artifact_links(version_page_url): sha1 = get_artifact_sha1(artifact_link) - classifier = get_classifier_from_artifact_url(artifact_link) + classifier = get_classifier_from_artifact_url(artifact_link, version_page_url, name, version) qualifiers = None if classifier: qualifiers = f'classifier={classifier}' - package = 
Package.objects.create( + package_data = PackageData( type='maven', namespace=namespace, name=name, @@ -141,5 +143,15 @@ def process_request(importable_uri): download_url=artifact_link, sha1=sha1, ) - if package: - logger.info('Created package {package}') + package, created, merged, map_error = merge_or_create_package( + scanned_package=package_data, + visit_level=50 + ) + if created: + logger.info(f'Created package {package}') + if merged: + logger.info(f'Updated package {package}') + if map_error: + logger.error(f'Error encountered: {map_error}') + importable_uri.processing_error = map_error + importable_uri.save() diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 57bc0c19..f4897c7c 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -569,7 +569,7 @@ def get_maven_root(url): return None -def determine_namespace_name_version_from_url(url, root_url): +def determine_namespace_name_version_from_url(url): """ Return a 3-tuple containing strings of a Package namespace, name, and version, determined from `url`, where `url` points to namespace, package, @@ -755,7 +755,7 @@ def get_classifier_from_artifact_url(artifact_url, package_version_page_url, pac _, remaining_url_portion = artifact_url.split(leading_url_portion) # ['-onejar', 'jar'] remaining_url_portions = remaining_url_portion.split('.') - if remaining_url_portions: + if remaining_url_portions and remaining_url_portions[0]: # '-onejar' classifier = remaining_url_portion[0] if classifier.startswith('-'): From b0880295a574afbbeca3ae12f300688d80d19284 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 19 Sep 2023 16:26:59 -0700 Subject: [PATCH 04/19] Pass root_url through functions #179 * This is to avoid using get_maven_root repeatedly * Save versionless purl to importable_uris Signed-off-by: Jono Yang --- minecode/management/commands/import_queue.py | 9 +++++-- minecode/models.py | 3 ++- minecode/visitors/maven.py | 25 +++++++++++++------- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 68dcd29f..1cec902e 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -121,8 +121,13 @@ def process_request(importable_uri): if response: data = requests.text - # TODO: determine namespace from initial traversal - namespace, name, _ = determine_namespace_name_version_from_url(uri) + purl = importable_uri.package_url + if purl: + package_url = PackageURL(purl) + namespace = package_url.namespace + name = package_url.name + else: + namespace, name, _ = determine_namespace_name_version_from_url(uri) # Go into each version directory for link in collect_links_from_text(data, filter_only_directories): diff --git a/minecode/models.py b/minecode/models.py index 3efcb040..3a6f046e 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -936,7 +936,7 @@ def save(self, *args, **kwargs): # TODO: Use the QuerySet.as_manager() for more flexibility and chaining. 
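+# A hypothetical sketch of that refactor, moving the filters onto a custom
+# QuerySet exposed via as_manager() so they chain naturally:
+#
+#     class ImportableURIQuerySet(models.QuerySet):
+#         def never_processed(self):
+#             return self.filter(processed_date__isnull=True, wip_date__isnull=True)
+#
+#     class ImportableURI(BaseURI):
+#         objects = ImportableURIQuerySet.as_manager()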
class ImportableURIManager(models.Manager): - def insert(self, uri, data, **extra_fields): + def insert(self, uri, data, package_url, **extra_fields): """ Create and return a new ImportableURI Return None if the insertion failed when the same URI exists with the same versions to be collected @@ -954,6 +954,7 @@ def insert(self, uri, data, **extra_fields): importable_uri = self.create( uri=uri, data=data, + package_url=package_url, **extra_fields ) return importable_uri diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index f4897c7c..1ceab126 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -569,7 +569,7 @@ def get_maven_root(url): return None -def determine_namespace_name_version_from_url(url): +def determine_namespace_name_version_from_url(url, root_url=None): """ Return a 3-tuple containing strings of a Package namespace, name, and version, determined from `url`, where `url` points to namespace, package, @@ -580,9 +580,10 @@ def determine_namespace_name_version_from_url(url): >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') ('net.shibboleth', 'parent', '7.11.0') """ - root_url = get_maven_root(url) if not root_url: - raise Exception(f'Error: not a Maven repository: {url}') + root_url = get_maven_root(url) + if not root_url: + raise Exception(f'Error: not a Maven repository: {url}') _, remaining_path_segments = url.split(root_url) remaining_path_segments = remaining_path_segments.split('/') @@ -606,7 +607,7 @@ def determine_namespace_name_version_from_url(url): return namespace, package_name, package_version -def add_to_import_queue(url): +def add_to_import_queue(url, root_url): """ Create ImportableURI for the Maven repo package page at `url`. """ @@ -615,7 +616,13 @@ def add_to_import_queue(url): response = requests.get(url) if response: data = response.text - importable_uri = ImportableURI.objects.insert(url, data) + namespace, name, _ = determine_namespace_name_version_from_url(url, root_url) + purl = PackageURL( + type='maven', + namespace=namespace, + name=name, + ) + importable_uri = ImportableURI.objects.insert(url, data, purl) if importable_uri: logger.info(f'Inserted {url} into ImportableURI queue') @@ -706,16 +713,16 @@ def get_artifact_links(url): return directory_links -def crawl_to_package(url): +def crawl_to_package(url, root_url): """ Given a maven repo `url`, """ if is_package_page(url): - add_to_import_queue(url) + add_to_import_queue(url, root_url) return for link in get_directory_links(url): - crawl_to_package(link) + crawl_to_package(link, root_url) def crawl_maven_repo_from_root(root_url): @@ -723,7 +730,7 @@ def crawl_maven_repo_from_root(root_url): Given the `url` to a maven root, traverse the repo depth-first and add packages to the import queue. 
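+
+    A minimal sketch of the intended entry point (Maven Central is used here
+    only as an example; any Maven repository root URL works):
+
+        crawl_maven_repo_from_root('https://repo.maven.apache.org/maven2')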
""" - crawl_to_package(root_url) + crawl_to_package(root_url, root_url) def get_artifact_sha1(artifact_url): From ebd0eb6f51bbd4f6785699c09d47bd7240e487b8 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 19 Sep 2023 18:09:15 -0700 Subject: [PATCH 05/19] Create maven_crawler command #179 Signed-off-by: Jono Yang --- minecode/management/commands/maven_crawler.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 minecode/management/commands/maven_crawler.py diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py new file mode 100644 index 00000000..140c6dc5 --- /dev/null +++ b/minecode/management/commands/maven_crawler.py @@ -0,0 +1,56 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import logging +import signal +import sys +import time + +from minecode.management.commands import VerboseCommand +from minecode.visitors.maven import crawl_maven_repo_from_root + + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout) +logger.setLevel(logging.INFO) + +TRACE = False +if TRACE: + logger.setLevel(logging.DEBUG) + + +MUST_STOP = False + + +def stop_handler(*args, **kwargs): + """ + Signal handler to set global variable to True. + """ + global MUST_STOP + MUST_STOP = True + + +signal.signal(signal.SIGTERM, stop_handler) + + +class Command(VerboseCommand): + help = 'Run a Package request queue.' + + def handle(self, *args, **options): + + global MUST_STOP + + while True: + if MUST_STOP: + logger.info('Graceful exit of the crawler') + break + + maven_root_url = 'https://repo.maven.apache.org/maven2' + crawl_maven_repo_from_root(root_url=maven_root_url) + From f51ca3135eafc378b9f70def5d97b1286e9898c0 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 21 Sep 2023 18:29:04 -0700 Subject: [PATCH 06/19] Use PackageURL.from_string #179 Signed-off-by: Jono Yang --- minecode/management/commands/import_queue.py | 2 +- minecode/management/commands/maven_crawler.py | 29 ++----------------- 2 files changed, 3 insertions(+), 28 deletions(-) diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 1cec902e..3f27d28c 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -123,7 +123,7 @@ def process_request(importable_uri): purl = importable_uri.package_url if purl: - package_url = PackageURL(purl) + package_url = PackageURL.from_string(purl) namespace = package_url.namespace name = package_url.name else: diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py index 140c6dc5..30c8f360 100644 --- a/minecode/management/commands/maven_crawler.py +++ b/minecode/management/commands/maven_crawler.py @@ -8,9 +8,7 @@ # import logging -import signal import sys -import time from minecode.management.commands import VerboseCommand from minecode.visitors.maven import crawl_maven_repo_from_root @@ -25,32 +23,9 @@ logger.setLevel(logging.DEBUG) -MUST_STOP = False - - -def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. 
- """ - global MUST_STOP - MUST_STOP = True - - -signal.signal(signal.SIGTERM, stop_handler) - - class Command(VerboseCommand): help = 'Run a Package request queue.' def handle(self, *args, **options): - - global MUST_STOP - - while True: - if MUST_STOP: - logger.info('Graceful exit of the crawler') - break - - maven_root_url = 'https://repo.maven.apache.org/maven2' - crawl_maven_repo_from_root(root_url=maven_root_url) - + maven_root_url = 'https://repo.maven.apache.org/maven2' + crawl_maven_repo_from_root(root_url=maven_root_url) From 3e58ff0e29b6e85ac5d0e52f2efc8ffb59265194 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 27 Sep 2023 13:19:06 -0700 Subject: [PATCH 07/19] Collect timestamps from pages #179 Signed-off-by: Jono Yang --- minecode/management/commands/import_queue.py | 8 +- minecode/visitors/maven.py | 86 +++++++++++++++++++- 2 files changed, 90 insertions(+), 4 deletions(-) diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 3f27d28c..4e814ae6 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -16,12 +16,13 @@ from django.db import transaction from django.utils import timezone +from django.utils.dateparse import parse_datetime from packageurl import PackageURL from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand from minecode.models import ImportableURI -from minecode.visitors.maven import get_artifact_links +from minecode.visitors.maven import get_artifact_links2 from minecode.visitors.maven import get_classifier_from_artifact_url from minecode.visitors.maven import collect_links_from_text from minecode.visitors.maven import filter_only_directories @@ -133,12 +134,14 @@ def process_request(importable_uri): for link in collect_links_from_text(data, filter_only_directories): version = link.rstrip('/') version_page_url = f'{uri}/{version}' - for artifact_link in get_artifact_links(version_page_url): + timestamps_by_artifact_links = get_artifact_links2(version_page_url) + for artifact_link, timestamp in timestamps_by_artifact_links.items(): sha1 = get_artifact_sha1(artifact_link) classifier = get_classifier_from_artifact_url(artifact_link, version_page_url, name, version) qualifiers = None if classifier: qualifiers = f'classifier={classifier}' + release_date = parse_datetime(timestamp) package_data = PackageData( type='maven', namespace=namespace, @@ -147,6 +150,7 @@ def process_request(importable_uri): qualifiers=qualifiers, download_url=artifact_link, sha1=sha1, + release_date=release_date, ) package, created, merged, map_error = merge_or_create_package( scanned_package=package_data, diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 1ceab126..c556a8bc 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -455,7 +455,8 @@ def process_request(purl_str): return error -collect_links = re.compile('href="([^"]+)"').findall +collect_links = re.compile(r'href="([^"]+)"').findall +collect_artifact_timestamps = re.compile(r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}').findall def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): @@ -631,6 +632,14 @@ def filter_only_directories(links): return [l for l in links if l != '../' and l.endswith('/')] +def filter_only_directories2(timestamps_by_links): + timestamps_by_links_filtered = {} + for link, timestamp in timestamps_by_links.items(): + if link != '../' and link.endswith('/'): + 
timestamps_by_links_filtered[link] = timestamp + return timestamps_by_links_filtered + + valid_artifact_extensions = [ 'ejb3', 'ear', @@ -659,6 +668,15 @@ def filter_for_artifacts(links): return artifacts +def filter_for_artifacts2(timestamps_by_links): + timestamps_by_links_filtered = {} + for link, timestamp in timestamps_by_links.items(): + for ext in valid_artifact_extensions: + if link.endswith(ext): + timestamps_by_links_filtered[link] = timestamp + return timestamps_by_links_filtered + + def collect_links_from_text(text, filter): """ Return a list of link locations, given HTML `text` content, that is filtered @@ -669,6 +687,25 @@ def collect_links_from_text(text, filter): return links +def collect_links_from_text2(text, filter): + """ + Return a list of link locations, given HTML `text` content, that is filtered + using `filter`. + """ + links = collect_links(text) + timestamps = collect_artifact_timestamps(text) + timestamps_by_links = {} + for i, link in enumerate(links): + if link == '../': + timestamp = '' + else: + timestamp = timestamps[i-1] + timestamps_by_links[link] = timestamp + + timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) + return timestamps_by_links + + def create_absolute_urls_for_links(text, url, filter): """ Given the `text` contents from `url`, return a list of absolute URLs to @@ -683,6 +720,21 @@ def create_absolute_urls_for_links(text, url, filter): return absolute_links +def create_absolute_urls_for_links2(text, url, filter): + """ + Given the `text` contents from `url`, return a list of absolute URLs to + links from `url` that are filtered by `checker`. + """ + timestamps_by_absolute_links = {} + url = url.rstrip('/') + timestamps_by_links = collect_links_from_text2(text, filter) + for link, timestamp in timestamps_by_links.items(): + if not link.startswith(url): + link = f'{url}/{link}' + timestamps_by_absolute_links[link] = timestamp + return timestamps_by_absolute_links + + def get_directory_links(url): """ Return a list of absolute directory URLs of the hyperlinks from `url` @@ -698,6 +750,21 @@ def get_directory_links(url): return directory_links +def get_directory_links2(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + timestamps_by_directory_links = {} + response = requests.get(url) + if response: + timestamps_by_directory_links = create_absolute_urls_for_links2( + response.text, + url=url, + filter=filter_only_directories2 + ) + return timestamps_by_directory_links + + def get_artifact_links(url): """ Return a list of absolute directory URLs of the hyperlinks from `url` @@ -713,6 +780,21 @@ def get_artifact_links(url): return directory_links +def get_artifact_links2(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + timestamps_by_artifact_links = [] + response = requests.get(url) + if response: + timestamps_by_artifact_links = create_absolute_urls_for_links2( + response.text, + url=url, + filter=filter_for_artifacts2 + ) + return timestamps_by_artifact_links + + def crawl_to_package(url, root_url): """ Given a maven repo `url`, @@ -764,7 +846,7 @@ def get_classifier_from_artifact_url(artifact_url, package_version_page_url, pac remaining_url_portions = remaining_url_portion.split('.') if remaining_url_portions and remaining_url_portions[0]: # '-onejar' - classifier = remaining_url_portion[0] + classifier = remaining_url_portions[0] if classifier.startswith('-'): # 'onejar' classifier = classifier[1:] From ab293b472ff8cc05f030f9c4caa2a07ea465d5c6 
Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 27 Sep 2023 13:46:03 -0700 Subject: [PATCH 08/19] Remove old code #179 Signed-off-by: Jono Yang --- minecode/management/commands/import_queue.py | 9 ++- minecode/visitors/maven.py | 85 +++----------------- 2 files changed, 14 insertions(+), 80 deletions(-) diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 4e814ae6..6ee3a456 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -22,7 +22,7 @@ from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand from minecode.models import ImportableURI -from minecode.visitors.maven import get_artifact_links2 +from minecode.visitors.maven import get_artifact_links from minecode.visitors.maven import get_classifier_from_artifact_url from minecode.visitors.maven import collect_links_from_text from minecode.visitors.maven import filter_only_directories @@ -130,11 +130,12 @@ def process_request(importable_uri): else: namespace, name, _ = determine_namespace_name_version_from_url(uri) + timestamps_by_directory_links = collect_links_from_text(data, filter_only_directories) # Go into each version directory - for link in collect_links_from_text(data, filter_only_directories): - version = link.rstrip('/') + for directory_link in timestamps_by_directory_links.keys(): + version = directory_link.rstrip('/') version_page_url = f'{uri}/{version}' - timestamps_by_artifact_links = get_artifact_links2(version_page_url) + timestamps_by_artifact_links = get_artifact_links(version_page_url) for artifact_link, timestamp in timestamps_by_artifact_links.items(): sha1 = get_artifact_sha1(artifact_link) classifier = get_classifier_from_artifact_url(artifact_link, version_page_url, name, version) diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index c556a8bc..f7d5a6db 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -456,7 +456,7 @@ def process_request(purl_str): collect_links = re.compile(r'href="([^"]+)"').findall -collect_artifact_timestamps = re.compile(r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}').findall +collect_artifact_timestamps = re.compile(r'(-|\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})').findall def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): @@ -628,11 +628,7 @@ def add_to_import_queue(url, root_url): logger.info(f'Inserted {url} into ImportableURI queue') -def filter_only_directories(links): - return [l for l in links if l != '../' and l.endswith('/')] - - -def filter_only_directories2(timestamps_by_links): +def filter_only_directories(timestamps_by_links): timestamps_by_links_filtered = {} for link, timestamp in timestamps_by_links.items(): if link != '../' and link.endswith('/'): @@ -659,16 +655,7 @@ def filter_only_directories2(timestamps_by_links): ] -def filter_for_artifacts(links): - artifacts = [] - for l in links: - for ext in valid_artifact_extensions: - if l.endswith(ext): - artifacts.append(l) - return artifacts - - -def filter_for_artifacts2(timestamps_by_links): +def filter_for_artifacts(timestamps_by_links): timestamps_by_links_filtered = {} for link, timestamp in timestamps_by_links.items(): for ext in valid_artifact_extensions: @@ -678,16 +665,6 @@ def filter_for_artifacts2(timestamps_by_links): def collect_links_from_text(text, filter): - """ - Return a list of link locations, given HTML `text` content, that is filtered - using `filter`. 
- """ - links = collect_links(text) - links = filter(links=links) - return links - - -def collect_links_from_text2(text, filter): """ Return a list of link locations, given HTML `text` content, that is filtered using `filter`. @@ -696,7 +673,7 @@ def collect_links_from_text2(text, filter): timestamps = collect_artifact_timestamps(text) timestamps_by_links = {} for i, link in enumerate(links): - if link == '../': + if link.endswith('/') or not timestamps: timestamp = '' else: timestamp = timestamps[i-1] @@ -707,27 +684,13 @@ def collect_links_from_text2(text, filter): def create_absolute_urls_for_links(text, url, filter): - """ - Given the `text` contents from `url`, return a list of absolute URLs to - links from `url` that are filtered by `checker`. - """ - absolute_links = [] - url = url.rstrip('/') - for link in collect_links_from_text(text, filter): - if not link.startswith(url): - link = f'{url}/{link}' - absolute_links.append(link) - return absolute_links - - -def create_absolute_urls_for_links2(text, url, filter): """ Given the `text` contents from `url`, return a list of absolute URLs to links from `url` that are filtered by `checker`. """ timestamps_by_absolute_links = {} url = url.rstrip('/') - timestamps_by_links = collect_links_from_text2(text, filter) + timestamps_by_links = collect_links_from_text(text, filter) for link, timestamp in timestamps_by_links.items(): if not link.startswith(url): link = f'{url}/{link}' @@ -736,61 +699,31 @@ def create_absolute_urls_for_links2(text, url, filter): def get_directory_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ - directory_links = [] - response = requests.get(url) - if response: - directory_links = create_absolute_urls_for_links( - response.text, - url=url, - filter=filter_only_directories - ) - return directory_links - - -def get_directory_links2(url): """ Return a list of absolute directory URLs of the hyperlinks from `url` """ timestamps_by_directory_links = {} response = requests.get(url) if response: - timestamps_by_directory_links = create_absolute_urls_for_links2( + timestamps_by_directory_links = create_absolute_urls_for_links( response.text, url=url, - filter=filter_only_directories2 + filter=filter_only_directories ) return timestamps_by_directory_links def get_artifact_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ - directory_links = [] - response = requests.get(url) - if response: - directory_links = create_absolute_urls_for_links( - response.text, - url=url, - filter=filter_for_artifacts - ) - return directory_links - - -def get_artifact_links2(url): """ Return a list of absolute directory URLs of the hyperlinks from `url` """ timestamps_by_artifact_links = [] response = requests.get(url) if response: - timestamps_by_artifact_links = create_absolute_urls_for_links2( + timestamps_by_artifact_links = create_absolute_urls_for_links( response.text, url=url, - filter=filter_for_artifacts2 + filter=filter_for_artifacts ) return timestamps_by_artifact_links From 2e0f53367dd3b27d7a9a69b59a22eb547e243dda Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 27 Sep 2023 18:52:57 -0700 Subject: [PATCH 09/19] Update regex #179 * Get links and timestamps at the same time * Create command that gets release_date for maven packages Signed-off-by: Jono Yang --- .../commands/get_maven_release_dates.py | 49 +++++++++++++++++++ minecode/management/commands/import_queue.py | 4 +- minecode/visitors/maven.py | 13 +++-- 3 files changed, 57 
insertions(+), 9 deletions(-)
 create mode 100644 minecode/management/commands/get_maven_release_dates.py

diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py
new file mode 100644
index 00000000..d1cce890
--- /dev/null
+++ b/minecode/management/commands/get_maven_release_dates.py
@@ -0,0 +1,49 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from dateutil.parser import parse as dateutil_parse
+import logging
+import sys
+
+import requests
+
+from minecode.management.commands import VerboseCommand
+from packagedb.models import Package
+from minecode.visitors.maven import collect_links_from_text
+from minecode.visitors.maven import filter_for_artifacts
+from os.path import dirname

+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout)
+logger.setLevel(logging.INFO)
+
+TRACE = False
+if TRACE:
+    logger.setLevel(logging.DEBUG)
+
+
+class Command(VerboseCommand):
+    help = 'Get the release_date of Maven Packages that do not have one.'
+
+    def handle(self, *args, **options):
+        maven_packages = Package.objects.filter(type='maven', release_date=None)
+        for package in maven_packages:
+            download_url = package.download_url
+            package_version_page_url = dirname(download_url)
+            filename = download_url.rsplit('/')[-1]
+            response = requests.get(package_version_page_url)
+            if response:
+                timestamps_by_links = collect_links_from_text(response.text, filter=filter_for_artifacts)
+                timestamp = timestamps_by_links.get(filename)
+                if not timestamp:
+                    continue
+                timestamp = dateutil_parse(timestamp)
+                package.release_date = timestamp
+                # TODO: do batch update
+                package.save()

diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py
index 6ee3a456..55862921 100644
--- a/minecode/management/commands/import_queue.py
+++ b/minecode/management/commands/import_queue.py
@@ -7,6 +7,7 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
# +from dateutil.parser import parse as dateutil_parse import logging import signal import sys @@ -16,7 +17,6 @@ from django.db import transaction from django.utils import timezone -from django.utils.dateparse import parse_datetime from packageurl import PackageURL from minecode.management.commands import get_error_message @@ -142,7 +142,7 @@ def process_request(importable_uri): qualifiers = None if classifier: qualifiers = f'classifier={classifier}' - release_date = parse_datetime(timestamp) + release_date = dateutil_parse(timestamp) package_data = PackageData( type='maven', namespace=namespace, diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index f7d5a6db..ac76f715 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -456,7 +456,9 @@ def process_request(purl_str): collect_links = re.compile(r'href="([^"]+)"').findall -collect_artifact_timestamps = re.compile(r'(-|\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})').findall +collect_links_and_artifact_timestamps = re.compile( + r'[^"]+\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' +).findall def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): @@ -669,14 +671,11 @@ def collect_links_from_text(text, filter): Return a list of link locations, given HTML `text` content, that is filtered using `filter`. """ - links = collect_links(text) - timestamps = collect_artifact_timestamps(text) + links_and_timestamps = collect_links_and_artifact_timestamps(text) timestamps_by_links = {} - for i, link in enumerate(links): - if link.endswith('/') or not timestamps: + for link, timestamp in links_and_timestamps: + if timestamp == '-': timestamp = '' - else: - timestamp = timestamps[i-1] timestamps_by_links[link] = timestamp timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) From af67e2ca06c5bc7cc7eb0800a6b0e53de14a8d2e Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 28 Sep 2023 18:26:53 -0700 Subject: [PATCH 10/19] Create tests for new functions #179 Signed-off-by: Jono Yang --- minecode/tests/test_maven.py | 312 +++++++++++++++++++++++++++++++++++ minecode/visitors/maven.py | 16 +- 2 files changed, 324 insertions(+), 4 deletions(-) diff --git a/minecode/tests/test_maven.py b/minecode/tests/test_maven.py index 8584f49d..2375f20b 100644 --- a/minecode/tests/test_maven.py +++ b/minecode/tests/test_maven.py @@ -868,3 +868,315 @@ def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, merged_package = maven_visitor.get_merged_ancestor_package_from_maven_package(package=db_package) expected_loc = self.get_test_loc('maven/pom/pulsar-client-merged-ancestor-package.json') self.check_expected_results(merged_package.to_dict(), expected_loc, regen=regen) + + +class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + + def test_check_if_file_name_is_linked_on_page(self): + links = ['foo/', 'bar/', 'baz/'] + self.assertTrue( + maven_visitor.check_if_file_name_is_linked_on_page('foo/', links) + ) + self.assertFalse( + maven_visitor.check_if_file_name_is_linked_on_page('qux/', links) + ) + + def test_check_if_page_has_pom_files(self): + links1 = ['foo/', 'bar.jar', 'bar.pom'] + links2 = ['foo/', 'bar.jar'] + self.assertTrue(maven_visitor.check_if_page_has_pom_files(links1)) + self.assertFalse(maven_visitor.check_if_page_has_pom_files(links2)) + + def test_check_if_page_has_directories(self): + links1 = ['foo/', 'bar/', 'baz/'] + links2 = ['../', 'bar.pom', 'bar.jar'] + 
self.assertTrue(maven_visitor.check_if_page_has_directories(links1)) + self.assertFalse(maven_visitor.check_if_page_has_directories(links2)) + + def test_check_if_package_version_page(self): + links1 = ['../', 'bar.pom', 'bar.jar'] + links2 = ['../', 'foo/', 'bar/', 'baz/'] + self.assertTrue(maven_visitor.check_if_package_version_page(links1)) + self.assertFalse(maven_visitor.check_if_package_version_page(links2)) + + def test_check_if_package_page(self): + links1 = ['../', 'maven-metadata.xml'] + links2 = ['../', 'bar.pom', 'bar.jar'] + self.assertTrue(maven_visitor.check_if_package_page(links1)) + self.assertFalse(maven_visitor.check_if_package_page(links2)) + + def test_check_if_maven_root(self): + links1 = ['../', 'archetype-catalog.xml'] + links2 = ['../', 'bar.pom', 'bar.jar'] + self.assertTrue(maven_visitor.check_if_maven_root(links1)) + self.assertFalse(maven_visitor.check_if_maven_root(links2)) + + @mock.patch('requests.get') + def test_check_on_page(self, mock_request_get): + checker = maven_visitor.check_if_page_has_pom_files + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'parent-7.11.0.pom' + self.assertTrue(maven_visitor.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) + + @mock.patch('requests.get') + def test_is_maven_root(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'archetype-catalog.xml' + self.assertTrue(maven_visitor.is_maven_root('https://repo1.maven.org/maven2/')) + + @mock.patch('requests.get') + def test_is_package_page(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'maven-metadata.xml' + self.assertTrue(maven_visitor.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/')) + + @mock.patch('requests.get') + def test_is_package_version_page(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = ''' + ../ + parent-7.11.0.pom + ''' + self.assertTrue(maven_visitor.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) + + def test_url_parts(self): + url = 'https://example.com/foo/bar/baz.jar' + scheme, netloc, path_segments = maven_visitor.url_parts(url) + self.assertEqual('https', scheme) + self.assertEqual('example.com', netloc) + self.assertEquals(['foo', 'bar', 'baz.jar'], path_segments) + + def test_create_url(self): + scheme = 'https' + netloc = 'example.com' + path_segments = ['foo', 'bar', 'baz.jar'] + url = 'https://example.com/foo/bar/baz.jar' + self.assertEqual( + url, + maven_visitor.create_url(scheme, netloc, path_segments) + ) + + @mock.patch('requests.get') + def test_get_maven_root(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'archetype-catalog.xml' + self.assertEqual( + 'https://repo1.maven.org/maven2', + maven_visitor.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + ) + + @mock.patch('requests.get') + def test_determine_namespace_name_version_from_url(self, mock_request_get): + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2' + root_url = 'https://repo1.maven.org/maven2' + + package_page_text = ''' + 1.0.b2/ + 2005-09-20 05:53 - + maven-metadata.xml + 2012-06-26 17:01 567 + ''' + package_page = mock.Mock(ok=True, text=package_page_text) + + package_version_page_text = ''' + ../ - + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + ''' + package_version_page = 
mock.Mock(ok=True, text=package_version_page_text) + mock_request_get.side_effect = [ + mock.Mock(ok=True, text=''), + mock.Mock(ok=True, text=''), + package_page, + mock.Mock(ok=True, text=''), + package_version_page + ] + + namespace, package_name, package_version = maven_visitor.determine_namespace_name_version_from_url(url, root_url) + self.assertEqual('xml-apis', namespace) + self.assertEqual('xml-apis', package_name) + self.assertEqual('1.0.b2', package_version) + + @mock.patch('requests.get') + def test_add_to_import_queue(self, mock_request_get): + from minecode.models import ImportableURI + + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' + root_url = 'https://repo1.maven.org/maven2' + + package_page_text = ''' + 1.0.b2/ + 2005-09-20 05:53 - + maven-metadata.xml + 2012-06-26 17:01 567 + ''' + package_page = mock.Mock(ok=True, text=package_page_text) + + package_version_page_text = ''' + ../ - + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + ''' + package_version_page = mock.Mock(ok=True, text=package_version_page_text) + mock_request_get.side_effect = [ + package_page, + mock.Mock(ok=True, text=''), + mock.Mock(ok=True, text=''), + package_page, + mock.Mock(ok=True, text=''), + package_version_page + ] + + self.assertEqual(0, ImportableURI.objects.all().count()) + maven_visitor.add_to_import_queue(url, root_url ) + self.assertEqual(1, ImportableURI.objects.all().count()) + importable_uri = ImportableURI.objects.get(uri=url) + self.assertEqual('pkg:maven/xml-apis/xml-apis', importable_uri.package_url) + + def test_filter_only_directories(self): + timestamps_by_links = { + '../': '-', + 'foo/': '-', + 'foo.pom': '2023-09-28', + } + expected = { + 'foo/': '-', + } + self.assertEqual( + expected, + maven_visitor.filter_only_directories(timestamps_by_links) + ) + + def test_filter_for_artifacts(self): + timestamps_by_links = { + '../': '2023-09-28', + 'foo.pom': '2023-09-28', + 'foo.ejb3': '2023-09-28', + 'foo.ear': '2023-09-28', + 'foo.aar': '2023-09-28', + 'foo.apk': '2023-09-28', + 'foo.gem': '2023-09-28', + 'foo.jar': '2023-09-28', + 'foo.nar': '2023-09-28', + 'foo.so': '2023-09-28', + 'foo.swc': '2023-09-28', + 'foo.tar': '2023-09-28', + 'foo.tar.gz': '2023-09-28', + 'foo.war': '2023-09-28', + 'foo.xar': '2023-09-28', + 'foo.zip': '2023-09-28', + } + expected = { + 'foo.ejb3': '2023-09-28', + 'foo.ear': '2023-09-28', + 'foo.aar': '2023-09-28', + 'foo.apk': '2023-09-28', + 'foo.gem': '2023-09-28', + 'foo.jar': '2023-09-28', + 'foo.nar': '2023-09-28', + 'foo.so': '2023-09-28', + 'foo.swc': '2023-09-28', + 'foo.tar': '2023-09-28', + 'foo.tar.gz': '2023-09-28', + 'foo.war': '2023-09-28', + 'foo.xar': '2023-09-28', + 'foo.zip': '2023-09-28', + } + self.assertEqual(expected, maven_visitor.filter_for_artifacts(timestamps_by_links)) + + def test_collect_links_from_text(self): + filter = maven_visitor.filter_only_directories + text = ''' + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + ''' + expected = { + '1.0.b2/': '2005-09-20 05:53', + '1.2.01/': '2010-02-03 21:05' + } + self.assertEqual( + expected, + maven_visitor.collect_links_from_text(text, filter=filter) + ) + + def test_create_absolute_urls_for_links(self): + filter = maven_visitor.filter_only_directories + text = ''' + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + ''' + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' + expected = { + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', + 
'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' + } + self.assertEqual( + expected, + maven_visitor.create_absolute_urls_for_links(text, url, filter=filter) + ) + + @mock.patch('requests.get') + def test_get_directory_links(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = ''' + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + ''' + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' + expected = { + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' + } + self.assertEqual(expected, maven_visitor.get_directory_links(url)) + + @mock.patch('requests.get') + def test_get_artifact_links(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = ''' + ../ + xml-apis-1.0.b2.jar + 2005-09-20 05:53 109318 + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + ''' + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/' + expected = { + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53', + } + self.assertEqual(expected, maven_visitor.get_artifact_links(url)) + + def test_crawl_to_package(self): + pass + + def test_crawl_maven_repo_from_root(self): + pass + + @mock.patch('requests.get') + def test_get_artifact_sha1(self, mock_request_get): + sha1 = '3136ca936f64c9d68529f048c2618bd356bf85c9' + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = sha1 + self.assertEqual(sha1, maven_visitor.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) + + def test_get_classifier_from_artifact_url(self): + artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' + package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/' + package_name = 'livereload-jvm' + package_version = '0.2.0' + classifier = maven_visitor.get_classifier_from_artifact_url( + artifact_url, + package_version_page_url, + package_name, + package_version + ) + self.assertEqual('onejar', classifier) diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index ac76f715..89f7f722 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -631,6 +631,9 @@ def add_to_import_queue(url, root_url): def filter_only_directories(timestamps_by_links): + """ + Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), + """ timestamps_by_links_filtered = {} for link, timestamp in timestamps_by_links.items(): if link != '../' and link.endswith('/'): @@ -658,6 +661,11 @@ def filter_only_directories(timestamps_by_links): def filter_for_artifacts(timestamps_by_links): + """ + Given a mapping of `timestamps_by_links`, where the links are the filenames + of Maven artifacts, return a mapping of filenames whose extension is in + `valid_artifact_extensions` and their timestamps. + """ timestamps_by_links_filtered = {} for link, timestamp in timestamps_by_links.items(): for ext in valid_artifact_extensions: @@ -668,8 +676,8 @@ def filter_for_artifacts(timestamps_by_links): def collect_links_from_text(text, filter): """ - Return a list of link locations, given HTML `text` content, that is filtered - using `filter`. 
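
The mapping that the rewritten docstring below describes is easy to picture with a self-contained sketch. The regex here is illustrative, not the module's exact pattern: it parses one Apache-style index row into a link/timestamp pair, normalizing the "-" placeholder to an empty string exactly as the loop in this hunk does:

    import re

    # illustrative pattern, not the module's own: one index row pairs an
    # <a href="..."> link with a "YYYY-MM-DD HH:MM" timestamp or "-"
    ROW = re.compile(
        r'href="(?P<link>[^"]+)"[^>]*>.*?</a>'
        r'\s+(?P<ts>\d{4}-\d{2}-\d{2} \d{2}:\d{2}|-)'
    )

    def timestamps_by_links(text):
        mapping = {}
        for row in ROW.finditer(text):
            ts = row.group('ts')
            mapping[row.group('link')] = '' if ts == '-' else ts
        return mapping

    listing = '<a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>  2005-09-20 05:53  -'
    assert timestamps_by_links(listing) == {'1.0.b2/': '2005-09-20 05:53'}
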
+ Return a mapping of link locations and their timestamps, given HTML `text` + content, that is filtered using `filter`. """ links_and_timestamps = collect_links_and_artifact_timestamps(text) timestamps_by_links = {} @@ -684,8 +692,8 @@ def collect_links_from_text(text, filter): def create_absolute_urls_for_links(text, url, filter): """ - Given the `text` contents from `url`, return a list of absolute URLs to - links from `url` that are filtered by `checker`. + Given the `text` contents from `url`, return a mapping of absolute URLs to + links from `url` and their timestamps, that is then filtered by `filter`. """ timestamps_by_absolute_links = {} url = url.rstrip('/') From 6443f18a59f4bdcfa37803f286a26284e590cd67 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 29 Sep 2023 10:55:32 -0700 Subject: [PATCH 11/19] Bulk update Packages #179 * Add logging messages Signed-off-by: Jono Yang --- .../commands/get_maven_release_dates.py | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py index d1cce890..f122a8c3 100644 --- a/minecode/management/commands/get_maven_release_dates.py +++ b/minecode/management/commands/get_maven_release_dates.py @@ -8,16 +8,17 @@ # from dateutil.parser import parse as dateutil_parse +from os.path import dirname import logging import sys import requests from minecode.management.commands import VerboseCommand -from packagedb.models import Package from minecode.visitors.maven import collect_links_from_text from minecode.visitors.maven import filter_for_artifacts -from os.path import dirname +from packagedb.models import Package + logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -29,12 +30,18 @@ class Command(VerboseCommand): - help = '' + help = 'Get and set release_date for Maven Packages' def handle(self, *args, **options): - maven_packages = Package.objects.filter(type='maven', release_date=None) - for package in maven_packages: + queryset = Package.objects.filter(type='maven', release_date=None) + object_count = queryset.count() + chunk_size = 2000 + iterator = queryset.iterator(chunk_size=chunk_size) + unsaved_objects = [] + for index, package in enumerate(iterator, start=1): download_url = package.download_url + package_url = package.package_url + logger.info(f'Updating release_date for package {package_url} ({download_url})') package_version_page_url = dirname(download_url) filename = download_url.rsplit('/')[-1] response = requests.get(package_version_page_url) @@ -42,8 +49,21 @@ def handle(self, *args, **options): timestamps_by_links = collect_links_from_text(response.text, filter=filter_for_artifacts) timestamp = timestamps_by_links.get(filename) if not timestamp: + logger.info(f'\tCould not get release_date for package {package_url} ({download_url})') continue timestamp = dateutil_parse(timestamp) package.release_date = timestamp - # TODO: do batch update - package.save() + unsaved_objects.append(package) + logger.info(f'\t{package_url} ({download_url}) release_date has been updated to {timestamp}') + else: + logger.info(f'\t{package_url} not updated: error encountered when visiting {package_version_page_url}') + if not (index % chunk_size) and unsaved_objects: + logger.info(f'{index:,} / {object_count:,} Packages processed') + + logger.info('Updating Package objects...') + updated_packages_count = Package.objects.bulk_update( + objs=unsaved_objects, + fields=['release_date'], + 
batch_size=1000, + ) + logger.info(f'Updated {updated_packages_count} Package objects') From 92117dc6bd1a3d28a11075a9999982fed76723ab Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 29 Sep 2023 13:14:39 -0700 Subject: [PATCH 12/19] Update regex #179 * Only update release_date for packages from maven.org Signed-off-by: Jono Yang --- minecode/management/commands/get_maven_release_dates.py | 8 +++++++- minecode/visitors/maven.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py index f122a8c3..c120b67e 100644 --- a/minecode/management/commands/get_maven_release_dates.py +++ b/minecode/management/commands/get_maven_release_dates.py @@ -33,11 +33,17 @@ class Command(VerboseCommand): help = 'Get and set release_date for Maven Packages' def handle(self, *args, **options): - queryset = Package.objects.filter(type='maven', release_date=None) + queryset = Package.objects.filter( + type='maven', + release_date=None, + download_url__startswith='https://repo1.maven.org/maven2' + ) object_count = queryset.count() chunk_size = 2000 iterator = queryset.iterator(chunk_size=chunk_size) unsaved_objects = [] + + logger.info(f'Updating release_date for {object_count} packages') for index, package in enumerate(iterator, start=1): download_url = package.download_url package_url = package.package_url diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 89f7f722..55624772 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -457,7 +457,7 @@ def process_request(purl_str): collect_links = re.compile(r'href="([^"]+)"').findall collect_links_and_artifact_timestamps = re.compile( - r'[^"]+\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' + r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' ).findall From 1c1873fc7cb8c247402ace26d6ce095e65e9002c Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 29 Sep 2023 14:25:06 -0700 Subject: [PATCH 13/19] Update release_date to DateTimeField #179 Signed-off-by: Jono Yang --- .../0078_alter_package_release_date.py | 22 +++++++++++++++++++ packagedb/models.py | 4 ++-- 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 packagedb/migrations/0078_alter_package_release_date.py diff --git a/packagedb/migrations/0078_alter_package_release_date.py b/packagedb/migrations/0078_alter_package_release_date.py new file mode 100644 index 00000000..b33739fa --- /dev/null +++ b/packagedb/migrations/0078_alter_package_release_date.py @@ -0,0 +1,22 @@ +# Generated by Django 4.1.2 on 2023-09-29 21:24 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("packagedb", "0077_remove_package_declared_license_expression_spdx_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="package", + name="release_date", + field=models.DateTimeField( + blank=True, + db_index=True, + help_text="The date and time that the package file was created, or when it was posted to its original download source.", + null=True, + ), + ), + ] diff --git a/packagedb/models.py b/packagedb/models.py index e4d7c58a..23588273 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -254,12 +254,12 @@ class AbstractPackage(models.Model): "By convention the first line should be a summary when available." 
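
The release_date change just below is the crux of this patch: the Maven listings scraped above carry minute-level timestamps, and a DateField silently drops the time component that the crawler now collects. A small sketch of what the DateTimeField preserves:

    from dateutil.parser import parse as dateutil_parse

    ts = dateutil_parse('2005-09-20 05:53')
    ts.date()   # datetime.date(2005, 9, 20): all a DateField can keep
    ts          # datetime.datetime(2005, 9, 20, 5, 53): what DateTimeField keeps
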
), ) - release_date = models.DateField( + release_date = models.DateTimeField( blank=True, null=True, db_index=True, help_text=_( - "The date that the package file was created, or when " + "The date and time that the package file was created, or when " "it was posted to its original download source." ), ) From e2da2aaf6ec34f2a8cc054248339574b84b21f75 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 3 Oct 2023 11:11:59 -0700 Subject: [PATCH 14/19] Return earliest Packages in filter_by_checksums Signed-off-by: Jono Yang --- packagedb/api.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/packagedb/api.py b/packagedb/api.py index d8d7f59f..3dd17d7e 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -9,7 +9,9 @@ import logging from django.core.exceptions import ValidationError +from django.db.models import OuterRef from django.db.models import Q +from django.db.models import Subquery from django_filters.rest_framework import FilterSet from django_filters.filters import Filter from django_filters.filters import OrderingFilter @@ -565,12 +567,20 @@ def filter_by_checksums(self, request, *args, **kwargs): lookups = Q() for field, value in data.items(): + # Subquery to get the ids of the Packages with the earliest release_date for each `field` + earliest_release_dates = Package.objects.filter( + **{field: OuterRef(field)} + ).order_by('release_date').values('id')[:1] + value = value or [] - # We create this intermediate dictionary so we can modify the field - # name to have __in at the end - d = {f'{field}__in': value} - lookups |= Q(**d) + lookups |= Q( + **{ + f'{field}__in': value, + 'id__in': Subquery(earliest_release_dates), + } + ) + # Query to get the full Package objects with the earliest release_date for each sha1 qs = Package.objects.filter(lookups) paginated_qs = self.paginate_queryset(qs) if enhance_package_data: @@ -803,7 +813,7 @@ def get_all_versions(purl: PackageURL): except InvalidVersion: logger.warning(f"Invalid version '{package_version.value}' for '{purl}'") pass - + return result From bb5cf1f7d6cd0c93e4cd5778e29e43e65e65eee5 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 9 Oct 2023 15:01:22 -0700 Subject: [PATCH 15/19] Fix issues found in packagedb migrations Signed-off-by: Jono Yang --- ...0047_add_search_vector_field_to_package.py | 28 +++++++++--- .../0059_compute_package_license_data.py | 43 ++++++++++++++++--- .../0062_compute_resource_license_data.py | 12 ++++-- .../migrations/0070_auto_20230706_0045.py | 19 ++++++-- 4 files changed, 84 insertions(+), 18 deletions(-) diff --git a/packagedb/migrations/0047_add_search_vector_field_to_package.py b/packagedb/migrations/0047_add_search_vector_field_to_package.py index c2687a27..9eccd785 100644 --- a/packagedb/migrations/0047_add_search_vector_field_to_package.py +++ b/packagedb/migrations/0047_add_search_vector_field_to_package.py @@ -1,6 +1,6 @@ # Generated by Django 3.1.5 on 2021-03-10 19:04 -import django.contrib.postgres.search +from django.contrib.postgres.search import SearchVector, SearchVectorField from django.db import migrations @@ -9,10 +9,26 @@ def populate_search_vector_field(apps, schema_editor): Data migration used to lowercase any purl field values that currently exist. 
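
The filter_by_checksums change above composes a correlated subquery so that only the earliest release of each matched checksum is returned. In isolation, the pattern looks like this (a sketch: requested_sha1s stands in for the checksum list from the request body, and sha1 for whichever checksum field is being filtered):

    from django.db.models import OuterRef, Subquery

    from packagedb.models import Package

    # for each candidate row, find the id of the Package sharing its sha1
    # that has the earliest release_date
    earliest_ids = (
        Package.objects
        .filter(sha1=OuterRef('sha1'))
        .order_by('release_date')
        .values('id')[:1]
    )
    earliest_per_sha1 = Package.objects.filter(
        sha1__in=requested_sha1s,      # hypothetical: checksums from the request
        id__in=Subquery(earliest_ids),
    )
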
""" Package = apps.get_model('packagedb', 'Package') - - for pkg in Package.objects.iterator(): - pkg.search_vector = search.SearchVector('namespace', 'name', 'version', 'download_url') - pkg.save() + resource_uris = Package.objects.iterator(chunk_size=5000) + updated = [] + for i, package in enumerate(resource_uris): + if not i % 5000: + Package.objects.bulk_update( + objs=updated, + fields=[ + 'search_vector', + ] + ) + updated = [] + package.search_vector = SearchVector('namespace', 'name', 'version', 'download_url') + updated.append(package) + if updated: + Package.objects.bulk_update( + objs=updated, + fields=[ + 'search_vector', + ] + ) class Migration(migrations.Migration): @@ -25,7 +41,7 @@ class Migration(migrations.Migration): migrations.AddField( model_name='package', name='search_vector', - field=django.contrib.postgres.search.SearchVectorField(null=True), + field=SearchVectorField(null=True), ), migrations.RunPython(populate_search_vector_field), ] diff --git a/packagedb/migrations/0059_compute_package_license_data.py b/packagedb/migrations/0059_compute_package_license_data.py index c57d14e9..109c4254 100644 --- a/packagedb/migrations/0059_compute_package_license_data.py +++ b/packagedb/migrations/0059_compute_package_license_data.py @@ -9,18 +9,51 @@ def compute_package_declared_license_expression_spdx(apps, schema_editor): Compute Package `declared_license_expression_spdx`, when missing, from `declared_license_expression`, when available. """ - from licensedcode.cache import build_spdx_license_expression + from licensedcode.cache import build_spdx_license_expression, InvalidLicenseKeyError + from packageurl import PackageURL Package = apps.get_model('packagedb', 'Package') packages = Package.objects.filter( ~Q(declared_license_expression="") & Q(declared_license_expression_spdx="") | Q(declared_license_expression__isnull=False) & Q(declared_license_expression_spdx__isnull=True) ) + package_count = packages.count() + chunk_size = 2000 + iterator = packages.iterator(chunk_size=chunk_size) + updated = [] + for i, package in enumerate(iterator): + if (not i % chunk_size) and updated: + Package.objects.bulk_update( + objs=updated, + fields=[ + 'declared_license_expression_spdx', + ] + ) + updated = [] + print(f" {i:,} / {package_count:,} computed and updated") + try: + if spdx := build_spdx_license_expression(package.declared_license_expression): + package.declared_license_expression_spdx = spdx + updated.append(package) + except InvalidLicenseKeyError as e: + package_url = PackageURL( + type=package.type, + namespace=package.namespace, + name=package.name, + version=package.version, + qualifiers=package.qualifiers, + subpath=package.subpath + ) + print(f" Error processing {package_url}: {e}") - for package in packages: - if spdx := build_spdx_license_expression(package.declared_license_expression): - package.declared_license_expression_spdx = spdx - package.save() + if updated: + print("Updating remaining Packages...") + Package.objects.bulk_update( + objs=updated, + fields=[ + 'declared_license_expression_spdx', + ] + ) class Migration(migrations.Migration): diff --git a/packagedb/migrations/0062_compute_resource_license_data.py b/packagedb/migrations/0062_compute_resource_license_data.py index 0cef20fc..f1b996c8 100644 --- a/packagedb/migrations/0062_compute_resource_license_data.py +++ b/packagedb/migrations/0062_compute_resource_license_data.py @@ -13,13 +13,13 @@ def compute_resource_detected_license_expression(apps, schema_editor): From scancode.io """ from 
license_expression import combine_expressions - from licensedcode.cache import build_spdx_license_expression + from licensedcode.cache import build_spdx_license_expression, InvalidLicenseKeyError if settings.IS_TESTS: return Resource = apps.get_model("packagedb", "Resource") - resources = Resource.objects.filter(~Q(license_expressions=[]) | Q(license_expressions__isnull=False)).only('license_expressions') + resources = Resource.objects.filter(~Q(license_expressions=[])).filter(license_expressions__is_null=False) object_count = resources.count() print(f"\nCompute detected_license_expression for {object_count:,} resources.") @@ -29,7 +29,11 @@ def compute_resource_detected_license_expression(apps, schema_editor): unsaved_objects = [] for index, resource in enumerate(iterator, start=1): - combined_expression = str(combine_expressions(resource.license_expressions)) + combined_expression = combine_expressions(resource.license_expressions) + if not combined_expression: + print(f' invalid license expression for {resource.path}: {combined_expression}') + continue + combined_expression = str(combined_expression) # gpl-2.0 OR broadcom-linking-unmodified OR proprietary-license # build_spdx_license_expression("broadcom-linking-unmodified") # AttributeError: 'LicenseSymbol' object has no attribute 'wrapped' @@ -122,7 +126,7 @@ def compute_resource_license_detections(apps, schema_editor): From scancode.io """ Resource = apps.get_model("packagedb", "Resource") - resources = Resource.objects.filter(~Q(licenses=[]) | Q(licenses__isnull=False)).only('licenses') + resources = Resource.objects.filter(~Q(licenses=[])).filter(licenses__is_null=False) object_count = resources.count() print(f"\nCompute license_detections for {object_count:,} resources.") diff --git a/packagedb/migrations/0070_auto_20230706_0045.py b/packagedb/migrations/0070_auto_20230706_0045.py index 9d18cbdd..d9fa116a 100644 --- a/packagedb/migrations/0070_auto_20230706_0045.py +++ b/packagedb/migrations/0070_auto_20230706_0045.py @@ -66,8 +66,11 @@ def create_maven_package_sets(apps, schema_editor): "version", "qualifiers", "subpath", - ).iterator( - chunk_size=5000 + ) + package_count = maven_packages_without_package_set.count() + chunk_size = 2000 + iterator = maven_packages_without_package_set.iterator( + chunk_size=chunk_size ) prev_namespace = None @@ -75,7 +78,17 @@ def create_maven_package_sets(apps, schema_editor): prev_version = None prev_package = None unupdated_packages = [] - for package in maven_packages_without_package_set: + for i, package in enumerate(iterator): + if not (i % chunk_size) and unupdated_packages: + Package.objects.bulk_update( + objs=unupdated_packages, + fields=[ + "package_content", + ] + ) + unupdated_packages = [] + print(f" {i:,} / {package_count:,} updated") + if "source" in package.qualifiers: package_content = PackageContentType.SOURCE_ARCHIVE else: From defa278fa452e010f76766021c6aa501cf560cb8 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 11 Oct 2023 14:00:07 -0700 Subject: [PATCH 16/19] Update expected test results Signed-off-by: Jono Yang --- .../end2end/expected_mapped_packages.json | 38 +++++++++---------- ..._mapped_commons-jaxrs-1.21-from-index.json | 2 +- ...prockets-vendor_gems-0.1.3.gem.mapped.json | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json b/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json index 5dfd490a..8fdc7fae 100644 --- 
a/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json +++ b/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json @@ -9,7 +9,7 @@ "package_content":null, "primary_language":null, "description":"APIs that App Engine provides to you to build your application.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -53,7 +53,7 @@ "package_content":null, "primary_language":null, "description":null, - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -97,7 +97,7 @@ "package_content":null, "primary_language":null, "description":null, - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -141,7 +141,7 @@ "package_content":null, "primary_language":null, "description":"Library which allows discovering classes at runtime", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -185,7 +185,7 @@ "package_content":null, "primary_language":null, "description":"Library which allows discovering classes at runtime", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -229,7 +229,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -273,7 +273,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -317,7 +317,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -361,7 +361,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -405,7 +405,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -449,7 +449,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -493,7 +493,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -537,7 +537,7 @@ 
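
Every hunk in this fixture is the same mechanical rewrite: once release_date is a DateTimeField, a date-only value round-trips through the API as midnight UTC. With the standard library alone:

    import datetime

    dt = datetime.datetime(2009, 5, 21, tzinfo=datetime.timezone.utc)
    dt.isoformat()   # '2009-05-21T00:00:00+00:00', rendered in these
                     # fixtures in the trailing-'Z' form, 2009-05-21T00:00:00Z
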
"package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -581,7 +581,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -625,7 +625,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -669,7 +669,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -713,7 +713,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -757,7 +757,7 @@ "package_content":null, "primary_language":null, "description":"The Social Graph Node Mapper is a community project to build a portable library to map social networking sites' URLs to and from a new canonical form (sgn:// URLs).", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -801,7 +801,7 @@ "package_content":null, "primary_language":null, "description":"The Social Graph Node Mapper is a community project to build a portable library to map social networking sites' URLs to and from a new canonical form (sgn:// URLs).", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, diff --git a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json index bc9d0ae4..a31a465f 100644 --- a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json +++ b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json @@ -9,7 +9,7 @@ "package_content":null, "primary_language":null, "description":"Common classes to make creating REST services more consistent.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, diff --git a/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json b/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json index 3513944b..643c8892 100644 --- a/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json +++ b/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json @@ -9,7 +9,7 @@ "package_content":null, "primary_language":null, "description":"Get the vendored assets paths in gems.", - "release_date":"2012-08-03", + "release_date":"2012-08-03T00:00:00Z", "parties":[ { 
"type":null, From 5784bf84e643686671b29e0e3e86fb36210b1e55 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 11 Oct 2023 15:28:04 -0700 Subject: [PATCH 17/19] Add similarity_score to directory match results * Update tests Signed-off-by: Jono Yang --- matchcode/api.py | 11 ++++++++++- matchcode/tests/test_api.py | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/matchcode/api.py b/matchcode/api.py index 092933e3..6294fb09 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -24,6 +24,7 @@ from matchcode_toolkit.fingerprinting import create_halohash_chunks from matchcode_toolkit.fingerprinting import hexstring_to_binarray from matchcode_toolkit.fingerprinting import split_fingerprint +from matchcode_toolkit.halohash import byte_hamming_distance from matchcode.models import ExactFileIndex from matchcode.models import ExactPackageArchiveIndex from matchcode.models import ApproximateDirectoryContentIndex @@ -91,6 +92,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer): lookup_field='uuid', read_only=True ) + similarity_score = CharField() class CharMultipleWidget(widgets.TextInput): @@ -271,11 +273,18 @@ def match(self, request): for fingerprint in unique_fingerprints: matches = model_class.match(fingerprint) for match in matches: + _, bah128 = split_fingerprint(fingerprint) + # Get fingerprint from the match + fp = match.fingerprint() + _, match_bah128 = split_fingerprint(fp) + hd = byte_hamming_distance(bah128, match_bah128) + similarity_score = (128 - hd) / 128 results.append( { 'fingerprint': fingerprint, - 'matched_fingerprint': match.fingerprint(), + 'matched_fingerprint': fp, 'package': match.package, + 'similarity_score': similarity_score, } ) diff --git a/matchcode/tests/test_api.py b/matchcode/tests/test_api.py index be971081..93c29ca3 100644 --- a/matchcode/tests/test_api.py +++ b/matchcode/tests/test_api.py @@ -117,6 +117,7 @@ def test_api_approximate_directory_content_index_match_close_match(self): self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid]) self.assertEqual(expected_package, result['package']) + self.assertEqual('0.9453125', result['similarity_score']) def test_api_approximate_directory_structure_index_match_close_match(self): # This test fingerprint has a hamming distance of 7 from the expected fingerprint @@ -133,6 +134,7 @@ def test_api_approximate_directory_structure_index_match_close_match(self): self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid]) self.assertEqual(expected_package, result['package']) + self.assertEqual('0.9453125', result['similarity_score']) def test_api_approximate_directory_content_index_match(self): test_fingerprint = '00000007af7d63765c78fa516b5353f5ffa7df45' @@ -147,6 +149,7 @@ def test_api_approximate_directory_content_index_match(self): self.assertEqual(test_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid]) self.assertEqual(expected_package, result['package']) + self.assertEqual('1.0', result['similarity_score']) def test_api_approximate_directory_structure_index_match(self): test_fingerprint = '00000004d10982208810240820080a6a3e852486' @@ -161,3 +164,4 @@ def test_api_approximate_directory_structure_index_match(self): self.assertEqual(test_fingerprint, 
result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid]) self.assertEqual(expected_package, result['package']) + self.assertEqual('1.0', result['similarity_score']) From b38be78f6e317f0aa22085129fa0f27d95ea8bdc Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 11 Oct 2023 16:27:37 -0700 Subject: [PATCH 18/19] Use FloatField instead of CharField Signed-off-by: Jono Yang --- matchcode/api.py | 3 ++- matchcode/tests/test_api.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/matchcode/api.py b/matchcode/api.py index 6294fb09..68844b8d 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -15,6 +15,7 @@ from rest_framework.decorators import action from rest_framework.response import Response from rest_framework.serializers import CharField +from rest_framework.serializers import FloatField from rest_framework.serializers import HyperlinkedRelatedField from rest_framework.serializers import ModelSerializer from rest_framework.serializers import ReadOnlyField @@ -92,7 +93,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer): lookup_field='uuid', read_only=True ) - similarity_score = CharField() + similarity_score = FloatField() class CharMultipleWidget(widgets.TextInput): diff --git a/matchcode/tests/test_api.py b/matchcode/tests/test_api.py index 93c29ca3..8decc568 100644 --- a/matchcode/tests/test_api.py +++ b/matchcode/tests/test_api.py @@ -117,7 +117,7 @@ def test_api_approximate_directory_content_index_match_close_match(self): self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid]) self.assertEqual(expected_package, result['package']) - self.assertEqual('0.9453125', result['similarity_score']) + self.assertEqual(0.9453125, result['similarity_score']) def test_api_approximate_directory_structure_index_match_close_match(self): # This test fingerprint has a hamming distance of 7 from the expected fingerprint @@ -134,7 +134,7 @@ def test_api_approximate_directory_structure_index_match_close_match(self): self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid]) self.assertEqual(expected_package, result['package']) - self.assertEqual('0.9453125', result['similarity_score']) + self.assertEqual(0.9453125, result['similarity_score']) def test_api_approximate_directory_content_index_match(self): test_fingerprint = '00000007af7d63765c78fa516b5353f5ffa7df45' @@ -149,7 +149,7 @@ def test_api_approximate_directory_content_index_match(self): self.assertEqual(test_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid]) self.assertEqual(expected_package, result['package']) - self.assertEqual('1.0', result['similarity_score']) + self.assertEqual(1.0, result['similarity_score']) def test_api_approximate_directory_structure_index_match(self): test_fingerprint = '00000004d10982208810240820080a6a3e852486' @@ -164,4 +164,4 @@ def test_api_approximate_directory_structure_index_match(self): self.assertEqual(test_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid]) self.assertEqual(expected_package, result['package']) - self.assertEqual('1.0', result['similarity_score']) + 
self.assertEqual('1.0', result['similarity_score'])
+        self.assertEqual(1.0, result['similarity_score'])

From b8ffe30fae1a1736785ec9907940b926014037b6 Mon Sep 17 00:00:00 2001
From: Jono Yang
Date: Wed, 18 Oct 2023 12:04:14 -0700
Subject: [PATCH 19/19] Update test results

Signed-off-by: Jono Yang
---
 .../expected_mapped_commons-jaxrs-1.21-from-pom.json          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json
index bc9d0ae4..a31a465f 100644
--- a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json
+++ b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json
@@ -9,7 +9,7 @@
   "package_content":null,
   "primary_language":null,
   "description":"Common classes to make creating REST services more consistent.",
-  "release_date":"2009-05-21",
+  "release_date":"2009-05-21T00:00:00Z",
   "parties":[],
   "keywords":[],
   "homepage_url":null,
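
To close out the similarity_score work from the two patches above: the score is derived from the Hamming distance between the 128-bit halohash bodies of the query and matched fingerprints. A standalone sketch, assuming the 40-hex-character fingerprint layout used in these tests (an 8-digit indexed-element count followed by the 128-bit bah128 body; byte_hamming_distance here is a stand-in for the matchcode_toolkit.halohash helper, whose exact signature may differ):

    def byte_hamming_distance(b1: bytes, b2: bytes) -> int:
        # count the differing bits between two equal-length byte strings
        return sum(bin(x ^ y).count('1') for x, y in zip(b1, b2))

    def similarity_score(fp1: str, fp2: str) -> float:
        # strip the 8-hex-digit count prefix and compare the 128-bit bodies
        bah1, bah2 = bytes.fromhex(fp1[8:]), bytes.fromhex(fp2[8:])
        return (128 - byte_hamming_distance(bah1, bah2)) / 128

    fp = '00000007af7d63765c78fa516b5353f5ffa7df45'
    assert similarity_score(fp, fp) == 1.0
    # a fingerprint differing in 7 bits scores (128 - 7) / 128 == 0.9453125,
    # the close-match value asserted in the tests above
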