Merge pull request #192 from nexB/179-focused-maven-visitor
179 focused maven visitor
JonoYang authored Oct 18, 2023
2 parents f9af26d + b8ffe30 commit ad37ccc
Showing 20 changed files with 1,398 additions and 53 deletions.
12 changes: 11 additions & 1 deletion matchcode/api.py
@@ -15,6 +15,7 @@
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.serializers import CharField
from rest_framework.serializers import FloatField
from rest_framework.serializers import HyperlinkedRelatedField
from rest_framework.serializers import ModelSerializer
from rest_framework.serializers import ReadOnlyField
@@ -24,6 +25,7 @@
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode_toolkit.fingerprinting import split_fingerprint
from matchcode_toolkit.halohash import byte_hamming_distance
from matchcode.models import ExactFileIndex
from matchcode.models import ExactPackageArchiveIndex
from matchcode.models import ApproximateDirectoryContentIndex
@@ -91,6 +93,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer):
        lookup_field='uuid',
        read_only=True
    )
    similarity_score = FloatField()


class CharMultipleWidget(widgets.TextInput):
@@ -271,11 +274,18 @@ def match(self, request):
        for fingerprint in unique_fingerprints:
            matches = model_class.match(fingerprint)
            for match in matches:
                _, bah128 = split_fingerprint(fingerprint)
                # Get fingerprint from the match
                fp = match.fingerprint()
                _, match_bah128 = split_fingerprint(fp)
                hd = byte_hamming_distance(bah128, match_bah128)
                similarity_score = (128 - hd) / 128
                results.append(
                    {
                        'fingerprint': fingerprint,
-                       'matched_fingerprint': match.fingerprint(),
+                       'matched_fingerprint': fp,
                        'package': match.package,
                        'similarity_score': similarity_score,
                    }
                )

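The new similarity_score reflects how close a query fingerprint is to the matched index entry: both fingerprints end in a 128-bit halohash, and the score is the fraction of bits they share. A minimal standalone sketch of the scoring, assuming the fingerprint layout used in these tests (8 hex characters of indexed-element count followed by 32 hex characters of halohash, which is what split_fingerprint separates):

# Sketch only, not the matchcode_toolkit implementation: score two
# 40-character hex fingerprints by Hamming distance over the trailing
# 128 bits; identical fingerprints score 1.0.
def fingerprint_similarity(fingerprint, matched_fingerprint):
    bah128 = int(fingerprint[8:], 16)
    matched_bah128 = int(matched_fingerprint[8:], 16)
    hamming_distance = bin(bah128 ^ matched_bah128).count('1')
    return (128 - hamming_distance) / 128
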
4 changes: 4 additions & 0 deletions matchcode/tests/test_api.py
@@ -117,6 +117,7 @@ def test_api_approximate_directory_content_index_match_close_match(self):
        self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint'])
        expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid])
        self.assertEqual(expected_package, result['package'])
        self.assertEqual(0.9453125, result['similarity_score'])

    def test_api_approximate_directory_structure_index_match_close_match(self):
        # This test fingerprint has a hamming distance of 7 from the expected fingerprint
@@ -133,6 +134,7 @@ def test_api_approximate_directory_structure_index_match_close_match(self):
        self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint'])
        expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid])
        self.assertEqual(expected_package, result['package'])
        self.assertEqual(0.9453125, result['similarity_score'])

    def test_api_approximate_directory_content_index_match(self):
        test_fingerprint = '00000007af7d63765c78fa516b5353f5ffa7df45'
@@ -147,6 +149,7 @@ def test_api_approximate_directory_content_index_match(self):
        self.assertEqual(test_fingerprint, result['matched_fingerprint'])
        expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid])
        self.assertEqual(expected_package, result['package'])
        self.assertEqual(1.0, result['similarity_score'])

    def test_api_approximate_directory_structure_index_match(self):
        test_fingerprint = '00000004d10982208810240820080a6a3e852486'
@@ -161,3 +164,4 @@ def test_api_approximate_directory_structure_index_match(self):
        self.assertEqual(test_fingerprint, result['matched_fingerprint'])
        expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid])
        self.assertEqual(expected_package, result['package'])
        self.assertEqual(1.0, result['similarity_score'])
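The expected scores above follow directly from the (128 - hd) / 128 computation in matchcode/api.py: the close-match fingerprints differ from the indexed ones by a Hamming distance of 7 (per the test comments), giving (128 - 7) / 128 = 0.9453125, while the exact matches score 1.0.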
75 changes: 75 additions & 0 deletions minecode/management/commands/get_maven_release_dates.py
@@ -0,0 +1,75 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from dateutil.parser import parse as dateutil_parse
from os.path import dirname
import logging
import sys

import requests

from minecode.management.commands import VerboseCommand
from minecode.visitors.maven import collect_links_from_text
from minecode.visitors.maven import filter_for_artifacts
from packagedb.models import Package


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.INFO)

TRACE = False
if TRACE:
    logger.setLevel(logging.DEBUG)


class Command(VerboseCommand):
    help = 'Get and set release_date for Maven Packages'

    def handle(self, *args, **options):
        queryset = Package.objects.filter(
            type='maven',
            release_date=None,
            download_url__startswith='https://repo1.maven.org/maven2'
        )
        object_count = queryset.count()
        chunk_size = 2000
        iterator = queryset.iterator(chunk_size=chunk_size)
        unsaved_objects = []

        logger.info(f'Updating release_date for {object_count} packages')
        for index, package in enumerate(iterator, start=1):
            download_url = package.download_url
            package_url = package.package_url
            logger.info(f'Updating release_date for package {package_url} ({download_url})')
            package_version_page_url = dirname(download_url)
            filename = download_url.rsplit('/')[-1]
            response = requests.get(package_version_page_url)
            if response:
                timestamps_by_links = collect_links_from_text(response.text, filter=filter_for_artifacts)
                timestamp = timestamps_by_links.get(filename)
                if not timestamp:
                    logger.info(f'\tCould not get release_date for package {package_url} ({download_url})')
                    continue
                timestamp = dateutil_parse(timestamp)
                package.release_date = timestamp
                unsaved_objects.append(package)
                logger.info(f'\t{package_url} ({download_url}) release_date has been updated to {timestamp}')
            else:
                logger.info(f'\t{package_url} not updated: error encountered when visiting {package_version_page_url}')
            if not (index % chunk_size) and unsaved_objects:
                logger.info(f'{index:,} / {object_count:,} Packages processed')

        logger.info('Updating Package objects...')
        updated_packages_count = Package.objects.bulk_update(
            objs=unsaved_objects,
            fields=['release_date'],
            batch_size=1000,
        )
        logger.info(f'Updated {updated_packages_count} Package objects')
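
Because this module lives under minecode/management/commands/, Django exposes it as a management command named get_maven_release_dates (python manage.py get_maven_release_dates). A minimal usage sketch, assuming the purldb Django settings are already configured:

# Run the release_date backfill from another script or command.
from django.core.management import call_command

call_command('get_maven_release_dates')
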
167 changes: 167 additions & 0 deletions minecode/management/commands/import_queue.py
@@ -0,0 +1,167 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from dateutil.parser import parse as dateutil_parse
import logging
import signal
import sys
import time

import requests

from django.db import transaction
from django.utils import timezone
from packageurl import PackageURL

from minecode.management.commands import get_error_message
from minecode.management.commands import VerboseCommand
from minecode.models import ImportableURI
from minecode.visitors.maven import get_artifact_links
from minecode.visitors.maven import get_classifier_from_artifact_url
from minecode.visitors.maven import collect_links_from_text
from minecode.visitors.maven import filter_only_directories
from minecode.visitors.maven import get_artifact_sha1
from minecode.model_utils import merge_or_create_package
from packagedcode.models import PackageData
from packagedb.models import Package
from minecode.visitors.maven import determine_namespace_name_version_from_url


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.INFO)

TRACE = False
if TRACE:
    logger.setLevel(logging.DEBUG)

# sleep duration in seconds when the queue is empty
SLEEP_WHEN_EMPTY = 10

MUST_STOP = False


def stop_handler(*args, **kwargs):
    """
    Signal handler to set global variable to True.
    """
    global MUST_STOP
    MUST_STOP = True


signal.signal(signal.SIGTERM, stop_handler)


class Command(VerboseCommand):
    help = 'Run a Package request queue.'

    def handle(self, *args, **options):
        """
        Get the next processable ImportableURI and start the
        processing. Loops forever and sleeps a short while if there are
        no ImportableURI left to process.
        """

        global MUST_STOP

        sleeping = False
        processed_counter = 0

        while True:
            if MUST_STOP:
                logger.info('Graceful exit of the request queue.')
                break

            with transaction.atomic():
                importable_uri = ImportableURI.objects.get_next_request()

            if not importable_uri:
                # Only log a single message when we go to sleep
                if not sleeping:
                    sleeping = True
                    logger.info('No more processable request, sleeping...')

                time.sleep(SLEEP_WHEN_EMPTY)
                continue

            sleeping = False

            # process request
            logger.info('Processing {}'.format(importable_uri))
            try:
                errors = process_request(importable_uri)
            except Exception as e:
                errors = 'Error: Failed to process ImportableURI: {}\n'.format(
                    repr(importable_uri))
                errors += get_error_message(e)
            finally:
                if errors:
                    importable_uri.processing_error = errors
                    logger.error(errors)
                importable_uri.processed_date = timezone.now()
                importable_uri.wip_date = None
                importable_uri.save()
                processed_counter += 1

        return processed_counter


def process_request(importable_uri):
    uri = importable_uri.uri
    uri = uri.rstrip('/')
    data = importable_uri.data
    if not data:
        # collect data again if we don't have it
        response = requests.get(uri)
        if response:
            data = response.text

    purl = importable_uri.package_url
    if purl:
        package_url = PackageURL.from_string(purl)
        namespace = package_url.namespace
        name = package_url.name
    else:
        namespace, name, _ = determine_namespace_name_version_from_url(uri)

    timestamps_by_directory_links = collect_links_from_text(data, filter_only_directories)
    # Go into each version directory
    for directory_link in timestamps_by_directory_links.keys():
        version = directory_link.rstrip('/')
        version_page_url = f'{uri}/{version}'
        timestamps_by_artifact_links = get_artifact_links(version_page_url)
        for artifact_link, timestamp in timestamps_by_artifact_links.items():
            sha1 = get_artifact_sha1(artifact_link)
            classifier = get_classifier_from_artifact_url(artifact_link, version_page_url, name, version)
            qualifiers = None
            if classifier:
                qualifiers = f'classifier={classifier}'
            release_date = dateutil_parse(timestamp)
            package_data = PackageData(
                type='maven',
                namespace=namespace,
                name=name,
                version=version,
                qualifiers=qualifiers,
                download_url=artifact_link,
                sha1=sha1,
                release_date=release_date,
            )
            package, created, merged, map_error = merge_or_create_package(
                scanned_package=package_data,
                visit_level=50
            )
            if created:
                logger.info(f'Created package {package}')
            if merged:
                logger.info(f'Updated package {package}')
            if map_error:
                logger.error(f'Error encountered: {map_error}')
                importable_uri.processing_error = map_error
                importable_uri.save()
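
The queue runner exits only when SIGTERM arrives: stop_handler flips MUST_STOP, and the loop finishes the ImportableURI it is working on before returning. A minimal supervision sketch, assuming it is launched from the project root where manage.py lives:

# Sketch: run the import queue as a child process, then stop it
# gracefully; SIGTERM triggers stop_handler so the worker exits after
# the request it is currently processing.
import signal
import subprocess
import time

worker = subprocess.Popen(['python', 'manage.py', 'import_queue'])
time.sleep(600)                     # let the worker drain the queue for a while
worker.send_signal(signal.SIGTERM)  # request a graceful exit
worker.wait()
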
31 changes: 31 additions & 0 deletions minecode/management/commands/maven_crawler.py
@@ -0,0 +1,31 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
import sys

from minecode.management.commands import VerboseCommand
from minecode.visitors.maven import crawl_maven_repo_from_root


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.INFO)

TRACE = False
if TRACE:
    logger.setLevel(logging.DEBUG)


class Command(VerboseCommand):
    help = 'Crawl the Maven repository from its root.'

    def handle(self, *args, **options):
        maven_root_url = 'https://repo.maven.apache.org/maven2'
        crawl_maven_repo_from_root(root_url=maven_root_url)