Skip to content

Commit

Permalink
Make last modified date export with the UTC offset (#2322)
Browse files Browse the repository at this point in the history
- Make the lastmod date export with the UTC offset (+00:00) (otherwise
Google Search Console complains the date format is incorrect)
- Set the lastmod date to the modified date of each entry, so crawlers
don't have to revisit every page every day.
- Add projection to avoid returning the entire bug.
  • Loading branch information
another-rex authored Jun 19, 2024
1 parent 3be5402 commit 3119dd1
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 30 deletions.
81 changes: 66 additions & 15 deletions docker/cron/generate_sitemap/generate_sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import osv.logs
import datetime
import argparse
from collections import defaultdict
from google.cloud import ndb

from xml.etree.ElementTree import Element, SubElement, ElementTree
Expand All @@ -31,18 +32,39 @@
_SITEMAP_URL_LIMIT = 49999


def fetch_vulnerability_ids(ecosystem: str) -> list[str]:
def epoch() -> datetime.datetime:
return datetime.datetime.fromtimestamp(0).astimezone(datetime.UTC)


alias_to_last_modified: defaultdict[str, datetime.datetime] = defaultdict(epoch)


def fetch_vulnerabilities_and_dates(
ecosystem: str) -> list[tuple[str, datetime.datetime]]:
"""Fetch each vulnerability's ID and last-modified date for the given ecosystem."""
# Query with projection to reduce data returned
# Order does not matter, other than to keep things consistent
bugs = osv.Bug.query(
osv.Bug.status == osv.BugStatus.PROCESSED,
osv.Bug.public == True, # pylint: disable=singleton-comparison
osv.Bug.ecosystem == ecosystem).order(-osv.Bug.timestamp)
bug_ids = [bug.db_id for bug in bugs]
return bug_ids
osv.Bug.ecosystem == ecosystem,
projection=[osv.Bug.last_modified]).order(-osv.Bug.last_modified)

bug_and_dates = []
for bug in bugs:
key = bug.key.id()
# Make sure to set the timezone to UTC to add +00:00 when outputting iso
last_mod_date = max(
bug.last_modified.replace(tzinfo=datetime.UTC),
alias_to_last_modified[bug.key.id()])
bug_and_dates.append((key, last_mod_date))

return bug_and_dates


def osv_get_ecosystems():
  """Get list of ecosystems."""
  # This includes ecosystems with only non processed/public entries
  ecosystems_query = osv.Bug.query(
      projection=[osv.Bug.ecosystem], distinct=True)
  # Projection results store the ecosystem as a single-element list; skip
  # entries with no ecosystem set.
  names = (bug.ecosystem[0] for bug in ecosystems_query if bug.ecosystem)
  # Case-insensitive sort keeps the ordering stable regardless of casing.
  return sorted(names, key=str.lower)
Expand All @@ -58,28 +80,42 @@ def get_sitemap_url_for_ecosystem(ecosystem: str, base_url: str) -> str:
return f'{base_url}/{_SITEMAPS_PREFIX}{ecosystem_name}.xml'


def generate_sitemap_for_ecosystem(ecosystem: str, base_url: str) -> None:
"""Generate a sitemap for the give n ecosystem."""
def generate_sitemap_for_ecosystem(ecosystem: str,
base_url: str) -> datetime.datetime:
"""
Generate a sitemap for the given ecosystem.
Returns the latest modified date of its entries.
"""
logging.info('Generating sitemap for ecosystem "%s".', ecosystem)
vulnerability_ids = fetch_vulnerability_ids(ecosystem)
vulnerability_and_dates = fetch_vulnerabilities_and_dates(ecosystem)
filename = get_sitemap_filename_for_ecosystem(ecosystem)
urlset = Element(
'urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')

if len(vulnerability_and_dates) > _SITEMAP_URL_LIMIT:
logging.warning('Ecosystem "%s" Exceeded sitemap size limit', ecosystem)

# TODO: For large ecosystems with over 50,000 vulnerabilities, generate
# multiple sitemaps.
for vuln in vulnerability_ids[:_SITEMAP_URL_LIMIT]:
for vuln_id, last_modified in vulnerability_and_dates[:_SITEMAP_URL_LIMIT]:
url = SubElement(urlset, 'url')
loc = SubElement(url, 'loc')
loc.text = f'{base_url}/vulnerability/{vuln}'
loc.text = f'{base_url}/vulnerability/{vuln_id}'
lastmod = SubElement(url, 'lastmod')
lastmod.text = datetime.datetime.now().isoformat()
lastmod.text = last_modified.isoformat()

tree = ElementTree(urlset)
tree.write(filename, encoding='utf-8', xml_declaration=True)

# Include epoch() as a floor so max() works even when there are no entries
return max([
last_mod for _, last_mod in vulnerability_and_dates[:_SITEMAP_URL_LIMIT]
] + [epoch()])


def generate_sitemap_index(ecosystems: set[str], base_url: str) -> None:
def generate_sitemap_index(ecosystems: set[str], base_url: str,
last_mod_dict: dict[str, datetime.datetime]) -> None:
"""Generate a sitemap index."""
logging.info('Generating sitemap index.')
sitemapindex = Element(
Expand All @@ -90,24 +126,38 @@ def generate_sitemap_index(ecosystems: set[str], base_url: str) -> None:
loc = SubElement(sitemap, 'loc')
loc.text = get_sitemap_url_for_ecosystem(ecosystem, base_url)
lastmod = SubElement(sitemap, 'lastmod')
lastmod.text = datetime.datetime.now().isoformat()
lastmod.text = last_mod_dict[ecosystem].isoformat()

tree = ElementTree(sitemapindex)
tree.write(_SITEMAP_INDEX_PATH, encoding='utf-8', xml_declaration=True)


def generate_sitemaps(base_url: str) -> None:
"""Generate sitemaps including all vulnerabilities, split by ecosystem."""

logging.info("Begin generating sitemaps.")
# Go over the base ecosystems index. Otherwise we'll have duplicated
# vulnerabilities in the sitemap.
base_ecosystems = {
ecosystem for ecosystem in osv_get_ecosystems() if ':' not in ecosystem
}

ecosystem_last_mod_dates = {}
for ecosystem in base_ecosystems:
generate_sitemap_for_ecosystem(ecosystem, base_url)
ecosystem_last_mod_dates[ecosystem] = generate_sitemap_for_ecosystem(
ecosystem, base_url)

generate_sitemap_index(base_ecosystems, base_url, ecosystem_last_mod_dates)


generate_sitemap_index(base_ecosystems, base_url)
def preload_alias_groups():
  """Fetch all alias groups, as we will be querying all of them anyway"""
  logging.info("Preloading alias groups into memory.")
  for group in osv.AliasGroup.query():
    group: osv.AliasGroup
    # Normalize to a UTC-aware datetime once per group so later comparisons
    # against bug last-modified dates are between aware datetimes.
    modified = group.last_modified.replace(tzinfo=datetime.UTC)  # type: ignore
    for bug_id in group.bug_ids:  # type: ignore
      alias_to_last_modified[bug_id] = modified


def main() -> int:
Expand All @@ -121,6 +171,7 @@ def main() -> int:
os.makedirs(_OUTPUT_DIRECTORY, exist_ok=True)
os.chdir(_OUTPUT_DIRECTORY)

preload_alias_groups()
generate_sitemaps(args.base_url)
return 0

Expand Down
55 changes: 40 additions & 15 deletions docker/cron/generate_sitemap/generate_sitemap_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from unittest.mock import patch, MagicMock
import generate_sitemap
import osv
from datetime import datetime
from datetime import UTC


class TestSitemapGeneration(unittest.TestCase):
Expand All @@ -31,16 +33,22 @@ def temp_file(self):
return self.test_file.name

@patch.object(osv.Bug, 'query')
def test_fetch_vulnerability_ids(self, mock_query):
def test_fetch_vulnerabilities_and_dates(self, mock_query):
"""Test it returns the vulnerability ids for ecosystem"""
# Mock the returned query
mock_query.return_value.order.return_value = [
MagicMock(db_id='vuln1'),
MagicMock(db_id='vuln2')
MagicMock(
last_modified=datetime.fromtimestamp(10),
key=MagicMock(id=MagicMock(return_value='vuln1'))),
MagicMock(
last_modified=datetime.fromtimestamp(20),
key=MagicMock(id=MagicMock(return_value='vuln2'))),
]

result = generate_sitemap.fetch_vulnerability_ids('Go')
self.assertEqual(result, ['vuln1', 'vuln2'])
result = generate_sitemap.fetch_vulnerabilities_and_dates('Go')
self.assertEqual(
result, [('vuln1', datetime.fromtimestamp(10).replace(tzinfo=UTC)),
('vuln2', datetime.fromtimestamp(20).replace(tzinfo=UTC))])

@patch.object(osv.Bug, 'query')
def test_osv_get_ecosystems(self, mock_query):
Expand All @@ -54,12 +62,15 @@ def test_osv_get_ecosystems(self, mock_query):
result = generate_sitemap.osv_get_ecosystems()
self.assertEqual(result, ['Go', 'UVI'])

@patch('generate_sitemap.fetch_vulnerability_ids')
@patch('generate_sitemap.fetch_vulnerabilities_and_dates')
@patch('generate_sitemap.ElementTree')
def test_generate_sitemap_for_ecosystem(self, mock_element_tree,
mock_fetch_vulns):
"""Check it generates the sitemap for ecosystem"""
mock_fetch_vulns.return_value = ['vuln1', 'vuln2']
mock_fetch_vulns.return_value = [
('vuln1', datetime.fromtimestamp(10).replace(tzinfo=UTC)),
('vuln2', datetime.fromtimestamp(20).replace(tzinfo=UTC))
]
mock_tree = MagicMock()
mock_element_tree.return_value = mock_tree

Expand All @@ -68,15 +79,18 @@ def test_generate_sitemap_for_ecosystem(self, mock_element_tree,
mock_tree.write.assert_called_once_with(
'./sitemap_Go.xml', encoding='utf-8', xml_declaration=True)

@patch('generate_sitemap.fetch_vulnerability_ids')
@patch('generate_sitemap.fetch_vulnerabilities_and_dates')
@patch('generate_sitemap.ElementTree')
def test_generate_sitemap_for_ecosystem_with_space(self, mock_element_tree,
mock_fetch_vulns):
""""
Check it creates the sitemap correctly where there is a space in the
ecosystem name.
"""
mock_fetch_vulns.return_value = ['vuln1', 'vuln2']
mock_fetch_vulns.return_value = [
('vuln1', datetime.fromtimestamp(10).replace(tzinfo=UTC)),
('vuln2', datetime.fromtimestamp(20).replace(tzinfo=UTC))
]
mock_tree = MagicMock()
mock_element_tree.return_value = mock_tree

Expand All @@ -86,15 +100,18 @@ def test_generate_sitemap_for_ecosystem_with_space(self, mock_element_tree,
mock_tree.write.assert_called_once_with(
'./sitemap_Rocky_Linux.xml', encoding='utf-8', xml_declaration=True)

@patch('generate_sitemap.fetch_vulnerability_ids')
@patch('generate_sitemap.fetch_vulnerabilities_and_dates')
@patch('generate_sitemap.ElementTree')
def test_generate_sitemap_for_ecosystem_with_period(self, mock_element_tree,
mock_fetch_vulns):
""""
Check it creates the sitemap correctly where there is a period in the
ecosystem name.
"""
mock_fetch_vulns.return_value = ['vuln1', 'vuln2']
mock_fetch_vulns.return_value = [
('vuln1', datetime.fromtimestamp(10).replace(tzinfo=UTC)),
('vuln2', datetime.fromtimestamp(20).replace(tzinfo=UTC))
]
mock_tree = MagicMock()
mock_element_tree.return_value = mock_tree

Expand All @@ -110,7 +127,11 @@ def test_generate_sitemap_index(self, mock_element_tree):
mock_tree = MagicMock()
mock_element_tree.return_value = mock_tree

generate_sitemap.generate_sitemap_index({'Go', 'UVI'}, 'http://example.com')
generate_sitemap.generate_sitemap_index(
{'Go', 'UVI'}, 'http://example.com', {
'Go': datetime.fromtimestamp(10).replace(tzinfo=UTC),
'UVI': datetime.fromtimestamp(20).replace(tzinfo=UTC)
})

mock_tree.write.assert_called_once_with(
'./sitemap_index.xml', encoding='utf-8', xml_declaration=True)
Expand All @@ -125,15 +146,19 @@ def test_generate_sitemap(self, mock_get_ecosystems, mock_generate_index,
sitemap index.
"""
mock_get_ecosystems.return_value = ['Go', 'UVI:Library', 'Android']

mock_generate_sitemap.return_value = datetime.fromtimestamp(10).replace(
tzinfo=UTC)
generate_sitemap.generate_sitemaps('http://example.com')

self.assertEqual(mock_generate_sitemap.call_count, 2)
mock_generate_sitemap.assert_any_call('Go', 'http://example.com')
mock_generate_sitemap.assert_any_call('Android', 'http://example.com')

mock_generate_index.assert_called_once_with({'Android', 'Go'},
'http://example.com')
mock_generate_index.assert_called_once_with(
{'Android', 'Go'}, 'http://example.com', {
'Android': datetime.fromtimestamp(10).replace(tzinfo=UTC),
'Go': datetime.fromtimestamp(10).replace(tzinfo=UTC),
})


if __name__ == '__main__':
Expand Down
9 changes: 9 additions & 0 deletions gcp/appengine/index.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ indexes:
direction: desc

# ecosystem + search_indices queries is served by merging the two indexes above

# To allow sitemap generation to project and sort on last_modified
- kind: Bug
properties:
- name: ecosystem
- name: public
- name: status
- name: last_modified
direction: desc

# TODO: See if this can be removed
- kind: Bug
Expand Down

0 comments on commit 3119dd1

Please sign in to comment.