Skip to content

Commit ca7b952

Browse files
committed
Synced webpages from sitemap to Document model for search.
1 parent 72b3840 commit ca7b952

File tree

8 files changed

+299
-12
lines changed

8 files changed

+299
-12
lines changed

docs/management/commands/update_docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ def build_doc_release(self, release, force=False, interactive=False):
132132
if self.verbosity >= 1:
133133
self.stdout.write(f"Starting update for {release} at {datetime.now()}...")
134134

135+
release.sync_from_sitemap()
136+
135137
# checkout_dir is shared for all languages.
136138
checkout_dir = settings.DOCS_BUILD_ROOT.joinpath("sources", release.version)
137139
parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath(

docs/models.py

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from functools import partial, reduce
66
from pathlib import Path
77

8+
import requests
89
from django.conf import settings
910
from django.contrib.postgres.indexes import GinIndex
1011
from django.contrib.postgres.search import (
@@ -34,8 +35,16 @@
3435
START_SEL,
3536
STOP_SEL,
3637
TSEARCH_CONFIG_LANGUAGES,
38+
DocumentationCategory,
39+
fetch_html,
3740
get_document_search_vector,
3841
)
42+
from .utils import extract_inner_html
43+
44+
45+
def get_search_config(lang):
    """
    Return the PostgreSQL text search configuration for ``lang``.

    Only the first two characters of ``lang`` are used as the lookup key in
    ``TSEARCH_CONFIG_LANGUAGES``; keys that are not present fall back to
    ``DEFAULT_TEXT_SEARCH_CONFIG``.
    """
    language_code = lang[:2]
    return TSEARCH_CONFIG_LANGUAGES.get(language_code, DEFAULT_TEXT_SEARCH_CONFIG)
3948

4049

4150
class DocumentReleaseQuerySet(models.QuerySet):
@@ -175,7 +184,7 @@ def sync_to_db(self, decoded_documents):
175184
the database. Deletes all the release's documents first then
176185
reinserts them as needed.
177186
"""
178-
self.documents.all().delete()
187+
self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete()
179188

180189
# Read excluded paths from robots.docs.txt.
181190
robots_path = settings.BASE_DIR.joinpath(
@@ -206,16 +215,54 @@ def sync_to_db(self, decoded_documents):
206215
path=document_path,
207216
title=html.unescape(strip_tags(document["title"])),
208217
metadata=document,
209-
config=TSEARCH_CONFIG_LANGUAGES.get(
210-
self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG
211-
),
218+
config=get_search_config(self.lang),
212219
)
213-
for document in self.documents.all():
220+
for document in self.documents.exclude(
221+
metadata__parents=DocumentationCategory.WEBSITE
222+
):
214223
document.metadata["breadcrumbs"] = list(
215224
Document.objects.breadcrumbs(document).values("title", "path")
216225
)
217226
document.save(update_fields=("metadata",))
218227

228+
def sync_from_sitemap(self):
    """
    Store the pages listed in the website sitemaps as documents.

    Does nothing unless this release has ``lang == "en"`` and a falsy
    ``release`` attribute. For each URL of each registered sitemap the page
    is fetched and the inner HTML of its ``<main>`` and ``<h1>`` tags is
    saved to a ``Document`` identified by this release and the URL. Pages
    that cannot be fetched, or from which the tags cannot be extracted, are
    skipped.
    """
    from djangoproject.urls.www import sitemaps

    eligible = self.lang == "en" and not self.release
    if not eligible:
        return

    website = DocumentationCategory.WEBSITE
    for sitemap_class in sitemaps.values():
        for entry in sitemap_class().get_urls():
            location = entry["location"]

            try:
                page_html = fetch_html(location)
            except requests.RequestException:
                # Best effort: a page that cannot be fetched is not indexed.
                continue

            try:
                main_html = extract_inner_html(page_html, tag="main")
                title = extract_inner_html(page_html, tag="h1")
            except ValueError:
                # The page lacks a usable <main> or <h1>; skip it.
                continue

            metadata = {
                "body": main_html,
                "breadcrumbs": [{"path": website, "title": "Website"}],
                "parents": website,
                "title": title,
                "toc": "",
            }
            Document.objects.update_or_create(
                release=self,
                path=location,
                defaults={
                    "title": title,
                    "metadata": metadata,
                    "config": get_search_config(self.lang),
                },
            )
265+
219266

220267
def _clean_document_path(path):
221268
# We have to be a bit careful to reverse-engineer the correct
@@ -228,7 +275,9 @@ def _clean_document_path(path):
228275

229276

230277
def document_url(doc):
231-
if doc.path:
278+
if doc.metadata.get("parents") == DocumentationCategory.WEBSITE:
279+
return doc.path
280+
elif doc.path:
232281
kwargs = {
233282
"lang": doc.release.lang,
234283
"version": doc.release.version,
@@ -273,6 +322,14 @@ def search(self, query_text, release, document_category=None):
273322
config=models.F("config"),
274323
)
275324
base_filter = Q(release_id=release.id)
325+
if release.lang == "en" and release.version != "dev":
326+
dev_release = DocumentRelease.objects.get_by_version_and_lang(
327+
"dev", "en"
328+
)
329+
base_filter |= Q(
330+
release_id=dev_release.id,
331+
metadata__parents=DocumentationCategory.WEBSITE,
332+
)
276333
if document_category:
277334
base_filter &= Q(metadata__parents__startswith=document_category)
278335
base_qs = (

docs/search.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import requests
12
from django.contrib.postgres.search import SearchVector
23
from django.db.models import TextChoices
34
from django.db.models.fields.json import KeyTextTransform
@@ -67,10 +68,41 @@ class DocumentationCategory(TextChoices):
6768
TOPICS = "topics", _("Using Django")
6869
HOWTO = "howto", _("How-to guides")
6970
RELEASE_NOTES = "releases", _("Release notes")
71+
WEBSITE = "website", _("Django Website")
7072

7173
@classmethod
def parse(cls, value, default=None):
    """
    Return the member of this enumeration whose value is ``value``.

    If ``value`` does not correspond to any member, return ``default``
    (``None`` unless the caller supplies another fallback).
    """
    try:
        return cls(value)
    except ValueError:
        # Honor the caller-supplied fallback instead of always returning None.
        return default
return None
79+
80+
81+
def fetch_html(url, timeout=10):
    """
    Return the body of ``url`` as text when it is a 200 text/html response.

    The request is sent with browser-like headers. A
    ``requests.RequestException`` is raised when the status code is not 200
    or when the response ``Content-Type`` does not contain ``text/html``.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    )
    request_headers = {
        "User-Agent": user_agent,
        "Accept": "text/html",
        "Accept-Language": "en-US,en;q=0.9",
    }

    response = requests.get(url, headers=request_headers, timeout=timeout)

    # Guard clauses: reject anything that is not a successful HTML response.
    if response.status_code != 200:
        raise requests.RequestException(
            f"Failed to fetch {url}, status code: {response.status_code}"
        )

    content_type = response.headers.get("Content-Type", "")
    if "text/html" not in content_type:
        raise requests.RequestException(f"Unexpected Content-Type: {content_type}")

    return response.text

docs/templates/docs/search_results.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ <h2>{% translate "No search query given" %}</h2>
4343
{% for result in page.object_list %}
4444
<dt>
4545
<h2 class="result-title">
46-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=result.path host 'docs' %}{% if not start_sel in result.headline %}{{ result.highlight|fragment }}{% endif %}">{{ result.headline|safe }}</a>
46+
<a href="{{ result.get_absolute_url }}{% if not start_sel in result.headline %}{{ result.highlight|fragment }}{% endif %}">{{ result.headline|safe }}</a>
4747
</h2>
4848
<span class="meta breadcrumbs">
4949
{% for breadcrumb in result.breadcrumbs %}
50-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=breadcrumb.path host 'docs' %}">{{ breadcrumb.title }}</a>{% if not forloop.last %} <span class="arrow">»</span>{% endif %}
50+
{# Website results have a single synthetic breadcrumb, so link it to the result itself; documentation breadcrumbs must link to their own page. #}
<a href="{% if breadcrumb.path == 'website' %}{{ result.get_absolute_url }}{% else %}{% url 'document-detail' lang=result.release.lang version=result.release.version url=breadcrumb.path host 'docs' %}{% endif %}">{{ breadcrumb.title }}</a>{% if not forloop.last %} <span class="arrow">»</span>{% endif %}
5151
{% endfor %}
5252
</span>
5353
</dt>
@@ -60,7 +60,7 @@ <h2 class="result-title">
6060
<ul class="code-links">
6161
{% for name, value in result_code_links.items %}
6262
<li>
63-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=result.path host 'docs' %}#{{ value.full_path }}">
63+
<a href="{{ result.get_absolute_url }}#{{ value.full_path }}">
6464
<div>
6565
<code>{{ name }}</code>
6666
{% if value.module_path %}<div class="meta">{{ value.module_path }}</div>{% endif %}

docs/tests/test_models.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
import datetime
22
from operator import attrgetter
33

4+
import requests_mock
45
from django.conf import settings
56
from django.db import connection
67
from django.test import TestCase
8+
from django.utils import timezone
79

10+
from blog.models import ContentFormat, Entry
811
from releases.models import Release
912

1013
from ..models import Document, DocumentRelease
14+
from ..search import DocumentationCategory
1115

1216

1317
class ModelsTests(TestCase):
@@ -173,6 +177,7 @@ def test_get_available_languages_by_version(self):
173177
class DocumentManagerTest(TestCase):
174178
@classmethod
175179
def setUpTestData(cls):
180+
cls.dev_release = DocumentRelease.objects.create(lang="en")
176181
cls.release = DocumentRelease.objects.create(
177182
release=Release.objects.create(version="1.2.3"),
178183
)
@@ -347,6 +352,20 @@ def setUpTestData(cls):
347352
"release": cls.release_fr,
348353
"title": "Notes de publication de Django 1.9.4",
349354
},
355+
{
356+
"metadata": {
357+
"body": "Main 1",
358+
"breadcrumbs": [
359+
{"path": DocumentationCategory.WEBSITE, "title": "Website"}
360+
],
361+
"parents": DocumentationCategory.WEBSITE,
362+
"title": "Title 1",
363+
"toc": "",
364+
},
365+
"path": "example",
366+
"release": cls.dev_release,
367+
"title": "Blog post",
368+
},
350369
]
351370
Document.objects.bulk_create(Document(**doc) for doc in documents)
352371

@@ -446,6 +465,16 @@ def test_search_title(self):
446465
),
447466
)
448467

468+
def test_website_document_items_included_english(self):
    """Searching ``self.release`` also returns the website document."""
    results = Document.objects.search("Main", self.release)
    expected_titles = ["Blog post"]
    self.assertQuerySetEqual(
        results,
        expected_titles,
        transform=attrgetter("title"),
    )
474+
475+
def test_website_document_items_excluded_non_english(self):
    """Searching ``self.release_fr`` returns no website document."""
    matches = Document.objects.search("Main", self.release_fr)
    self.assertEqual(matches.count(), 0)
477+
449478

450479
class UpdateDocTests(TestCase):
451480
@classmethod
@@ -547,3 +576,77 @@ def test_excluded_documents(self):
547576
)
548577
document = release.documents.get()
549578
self.assertEqual(document.path, "nonexcluded/bar")
579+
580+
def test_sync_to_db_not_delete_website_docs(self):
    """``sync_to_db`` leaves a document in the website category in place."""
    website = DocumentationCategory.WEBSITE
    website_metadata = {
        "body": "Main 1",
        "breadcrumbs": [{"path": website, "title": "Website"}],
        "parents": website,
        "title": "Title 1",
        "toc": "",
    }
    Document.objects.create(
        release=self.release,
        path="example_path",
        title="Title 1",
        metadata=website_metadata,
    )

    # Syncing an empty document list must not remove the website document.
    self.release.sync_to_db([])

    remaining = Document.objects.filter(release=self.release)
    self.assertEqual(remaining.count(), 1)
597+
598+
def test_sync_from_sitemap_skip_non_en_dev_release(self):
    """No document is created for non-English or versioned releases."""
    release = Release.objects.create(version="5.2")
    two_days_ago = timezone.now() - datetime.timedelta(days=2)
    Entry.objects.create(
        pub_date=two_days_ago,
        slug="a",
        body="<strong>test</strong>",
        content_format=ContentFormat.HTML,
        is_active=True,
    )

    skipped_combinations = [("fr", None), ("fr", release), ("en", release)]
    for lang, release_obj in skipped_combinations:
        doc_release = DocumentRelease.objects.create(
            lang=lang,
            release=release_obj,
        )
        with self.subTest(lang=lang, release=release_obj):
            doc_release.sync_from_sitemap()
            self.assertFalse(Document.objects.exists())
615+
616+
@requests_mock.mock()
def test_sync_from_sitemap(self, mocker):
    """A mocked blog entry page is stored as a website document."""
    blog_entry = Entry.objects.create(
        pub_date=timezone.now() - datetime.timedelta(days=2),
        slug="a",
        body="<strong>test</strong>",
        content_format=ContentFormat.HTML,
        is_active=True,
    )
    entry_url = blog_entry.get_absolute_url()
    mocker.get(
        entry_url,
        text="<html><main>Main 1</main><h1>Title 1</h1></html>",
        headers={"Content-Type": "text/html"},
    )

    self.release.sync_from_sitemap()

    document = Document.objects.get(release=self.release)
    website = DocumentationCategory.WEBSITE
    expected_metadata = {
        "body": "Main 1",
        "breadcrumbs": [{"path": website, "title": "Website"}],
        "parents": website,
        "title": "Title 1",
        "toc": "",
    }
    self.assertEqual(document.path, blog_entry.get_absolute_url())
    self.assertEqual(document.title, "Title 1")
    self.assertEqual(document.metadata, expected_metadata)

docs/tests/test_utils.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from django.test import SimpleTestCase
55

6-
from ..utils import get_doc_path, sanitize_for_trigram
6+
from ..utils import extract_inner_html, get_doc_path, sanitize_for_trigram
77

88

99
class TestUtils(SimpleTestCase):
@@ -38,3 +38,39 @@ def test_sanitize_for_trigram(self):
3838
]:
3939
with self.subTest(query=query):
4040
self.assertEqual(sanitize_for_trigram(query), sanitized_query)
41+
42+
def test_extract_inner_html(self):
    """``extract_inner_html`` returns the content inside the ``<main>`` tag."""
    cases = [
        ("<main><p>Hello</p></main>", "<p>Hello</p>"),
        (
            '<header>Test</header><main id="app" class="container">'
            "<h1>Title</h1></main>",
            "<h1>Title</h1>",
        ),
        # Character references come back decoded.
        ("<main>&amp; &lt; &gt; &#169;</main>", "& < > ©"),
        ("<main></main>", ""),
        ("<main>Hello world</main>", "Hello world"),
        ("<main><h1>Hi</h1>Text<p>Bye</p></main>", "<h1>Hi</h1>Text<p>Bye</p>"),
    ]
    for html, expected_output in cases:
        with self.subTest(html=html):
            actual = extract_inner_html(html, tag="main")
            self.assertEqual(actual, expected_output)
57+
58+
def test_extract_inner_html_multiple_same_tags_raises(self):
    """Two sibling ``<main>`` tags raise ``ValueError``."""
    duplicated = "<main>One main</main><main id='dupe'>Two main</main>"
    expected_message = "<main> occurs more than once in HTML."
    with self.assertRaisesMessage(ValueError, expected_message):
        extract_inner_html(duplicated, tag="main")
65+
66+
def test_extract_inner_html_multiple_same_tags_nested_raises(self):
    """A ``<main>`` tag nested inside another raises ``ValueError``."""
    nested = "<main>One main<main id='dupe'>Two main</main></main>"
    expected_message = "Nested <main> tags are not allowed."
    with self.assertRaisesMessage(ValueError, expected_message):
        extract_inner_html(nested, tag="main")
73+
74+
def test_extract_inner_html_tag_not_found_raises(self):
    """HTML without a ``<main>`` tag raises ``ValueError``."""
    without_main = "<p>Test</p>"
    expected_message = "<main> not found in HTML."
    with self.assertRaisesMessage(ValueError, expected_message):
        extract_inner_html(without_main, tag="main")

0 commit comments

Comments
 (0)