Skip to content

Commit 8020f62

Browse files
committed
Synced webpages from sitemap to Document model for search.
1 parent a22aff4 commit 8020f62

File tree

10 files changed

+435
-15
lines changed

10 files changed

+435
-15
lines changed

djangoproject/scss/_style.scss

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2647,7 +2647,7 @@ search.filters {
26472647
position: relative;
26482648

26492649
a {
2650-
padding: 10px 20px;
2650+
padding: 10px 15px;
26512651
text-decoration: none;
26522652
border-bottom: 3px solid transparent;
26532653
transition: color 0.3s ease, border-bottom 0.3s ease;

djangoproject/templates/includes/header.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
<li{% if 'download' in request.path %} class="active"{% endif %}>
2626
<a href="{% url 'download' %}">Download</a>
2727
</li>
28-
<li{% if request.host.name == 'docs' %} class="active"{% endif %}>
28+
<li{% if request.host.name == 'docs' and 'search' not in request.path %} class="active"{% endif %}>
2929
<a href="{% block doc_url %}{% url 'homepage' host 'docs' %}{% endblock %}">Documentation</a>
3030
</li>
3131
<li{% if 'weblog' in request.path %} class="active"{% endif %}>

docs/management/commands/update_docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ def build_doc_release(self, release, force=False, interactive=False):
134134
if self.verbosity >= 1:
135135
self.stdout.write(f"Starting update for {release} at {datetime.now()}...")
136136

137+
release.sync_from_sitemap(force=force)
138+
137139
# checkout_dir is shared for all languages.
138140
checkout_dir = settings.DOCS_BUILD_ROOT.joinpath("sources", release.version)
139141
parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath(

docs/models.py

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from functools import partial, reduce
66
from pathlib import Path
77

8+
import requests
89
from django.conf import settings
910
from django.contrib.postgres.indexes import GinIndex
1011
from django.contrib.postgres.search import (
@@ -34,8 +35,16 @@
3435
START_SEL,
3536
STOP_SEL,
3637
TSEARCH_CONFIG_LANGUAGES,
38+
DocumentationCategory,
39+
fetch_html,
3740
get_document_search_vector,
3841
)
42+
from .utils import extract_inner_html
43+
44+
45+
def get_search_config(lang):
    """
    Return the PostgreSQL text search configuration to use for ``lang``.

    Only the first two characters of ``lang`` are used for the lookup in
    ``TSEARCH_CONFIG_LANGUAGES``; when there is no entry for them,
    ``DEFAULT_TEXT_SEARCH_CONFIG`` is returned.
    """
    language_prefix = lang[:2]
    return TSEARCH_CONFIG_LANGUAGES.get(language_prefix, DEFAULT_TEXT_SEARCH_CONFIG)
3948

4049

4150
class DocumentReleaseQuerySet(models.QuerySet):
@@ -177,7 +186,7 @@ def sync_to_db(self, decoded_documents):
177186
the database. Deletes all the release's documents first then
178187
reinserts them as needed.
179188
"""
180-
self.documents.all().delete()
189+
self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete()
181190

182191
# Read excluded paths from robots.docs.txt.
183192
robots_path = settings.BASE_DIR.joinpath(
@@ -208,16 +217,66 @@ def sync_to_db(self, decoded_documents):
208217
path=document_path,
209218
title=html.unescape(strip_tags(document["title"])),
210219
metadata=document,
211-
config=TSEARCH_CONFIG_LANGUAGES.get(
212-
self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG
213-
),
220+
config=get_search_config(self.lang),
214221
)
215-
for document in self.documents.all():
222+
for document in self.documents.exclude(
223+
metadata__parents=DocumentationCategory.WEBSITE
224+
):
216225
document.metadata["breadcrumbs"] = list(
217226
Document.objects.breadcrumbs(document).values("title", "path")
218227
)
219228
document.save(update_fields=("metadata",))
220229

230+
def sync_from_sitemap(self, force=False):
    """
    Store a Document for every website sitemap URL that is not stored yet.

    Does nothing unless ``self.is_dev`` is true. When ``force`` is true, the
    existing website documents are deleted first so that every sitemap URL
    is fetched again.

    A URL is skipped when ``fetch_html`` raises
    ``requests.RequestException`` or when ``extract_inner_html`` raises
    ``ValueError`` for its ``main`` or ``h1`` tag.
    """
    from djangoproject.urls.www import sitemaps

    if not self.is_dev:
        return

    # NOTE(review): these queries are not restricted to this release, so
    # website documents attached to any release count as already synced --
    # confirm that only one dev release is expected to hold them.
    website_documents = Document.objects.filter(
        metadata__parents=DocumentationCategory.WEBSITE
    )
    if force:
        website_documents.delete()

    doc_urls = set(website_documents.values_list("path", flat=True))

    # The configuration only depends on the release language, so look it
    # up once instead of once per stored page.
    search_config = get_search_config(self.lang)

    for sitemap in sitemaps.values():
        for url in sitemap().get_urls():
            path = url["location"]
            if path in doc_urls:
                continue
            try:
                page_html = fetch_html(path)
            except requests.RequestException:
                continue
            try:
                main_html = extract_inner_html(page_html, tag="main")
                title = extract_inner_html(page_html, tag="h1")
            except ValueError:
                continue
            Document.objects.create(
                release=self,
                path=path,
                title=title,
                metadata={
                    "body": main_html,
                    "breadcrumbs": [
                        {
                            "path": DocumentationCategory.WEBSITE,
                            "title": "Website",
                        },
                    ],
                    "parents": DocumentationCategory.WEBSITE,
                    "title": title,
                    "toc": "",
                },
                config=search_config,
            )
            # Remember the path so that a URL listed more than once across
            # the sitemaps is fetched and stored only once during this run.
            doc_urls.add(path)
279+
221280

222281
def _clean_document_path(path):
223282
# We have to be a bit careful to reverse-engineer the correct
@@ -230,7 +289,9 @@ def _clean_document_path(path):
230289

231290

232291
def document_url(doc):
233-
if doc.path:
292+
if doc.metadata.get("parents") == DocumentationCategory.WEBSITE:
293+
return doc.path
294+
elif doc.path:
234295
kwargs = {
235296
"lang": doc.release.lang,
236297
"version": doc.release.version,
@@ -275,6 +336,14 @@ def search(self, query_text, release, document_category=None):
275336
config=models.F("config"),
276337
)
277338
base_filter = Q(release_id=release.id)
339+
if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev:
340+
dev_release = DocumentRelease.objects.get_by_version_and_lang(
341+
version="dev", lang=settings.DEFAULT_LANGUAGE_CODE
342+
)
343+
base_filter |= Q(
344+
release_id=dev_release.id,
345+
metadata__parents=DocumentationCategory.WEBSITE,
346+
)
278347
if document_category:
279348
base_filter &= Q(metadata__parents__startswith=document_category)
280349
base_qs = (

docs/search.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import requests
12
from django.contrib.postgres.search import SearchVector
23
from django.db.models import TextChoices
34
from django.db.models.fields.json import KeyTextTransform
@@ -67,10 +68,41 @@ class DocumentationCategory(TextChoices):
6768
TOPICS = "topics", _("Using Django")
6869
HOWTO = "howto", _("How-to guides")
6970
RELEASE_NOTES = "releases", _("Release notes")
71+
WEBSITE = "website", _("Django Website")
7072

7173
@classmethod
def parse(cls, value, default=None):
    """
    Convert ``value`` into a member of this class.

    Returns ``cls(value)`` when that succeeds. When it raises
    ``ValueError``, ``default`` is returned instead (``None`` unless the
    caller supplies another fallback).
    """
    try:
        return cls(value)
    except ValueError:
        # Honour the caller's fallback rather than always returning None.
        return default
79+
80+
81+
def fetch_html(url, timeout=10):
    """
    Return the body of ``url`` as text when it is a successful HTML response.

    The request is sent with browser-like ``User-Agent``, ``Accept`` and
    ``Accept-Language`` headers and the given ``timeout``.

    Raises ``requests.RequestException`` when the status code is not 200 or
    when the response ``Content-Type`` header does not contain
    ``text/html``.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
    )
    request_headers = {
        "User-Agent": user_agent,
        "Accept": "text/html",
        "Accept-Language": "en-US,en;q=0.9",
    }

    response = requests.get(url, headers=request_headers, timeout=timeout)

    # Guard clauses: reject anything that is not a 200 HTML response.
    if response.status_code != 200:
        raise requests.RequestException(
            f"Failed to fetch {url}, status code: {response.status_code}"
        )

    content_type = response.headers.get("Content-Type", "")
    if "text/html" not in content_type:
        raise requests.RequestException(f"Unexpected Content-Type: {content_type}")

    return response.text

docs/templates/docs/search_results.html

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
{% extends "docs/doc.html" %}
22
{% load i18n docs %}
33

4-
{% block title %}{% translate "Search | Django documentation" %}{% endblock %}
4+
{% block title %}{% translate "Search" %}{% endblock %}
5+
{% block header %}{% endblock %}
56

67
{% block toc-wrapper %}{% endblock %}
78
{% block breadcrumbs-wrapper %}{% endblock %}
@@ -43,11 +44,11 @@ <h2>{% translate "No search query given" %}</h2>
4344
{% for result in page.object_list %}
4445
<dt>
4546
<h2 class="result-title">
46-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=result.path host 'docs' %}{% if not start_sel in result.headline %}{{ result.highlight|fragment }}{% endif %}">{{ result.headline|safe }}</a>
47+
<a href="{{ result.get_absolute_url }}{% if not start_sel in result.headline %}{{ result.highlight|fragment }}{% endif %}">{{ result.headline|safe }}</a>
4748
</h2>
4849
<span class="meta breadcrumbs">
4950
{% for breadcrumb in result.breadcrumbs %}
50-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=breadcrumb.path host 'docs' %}">{{ breadcrumb.title }}</a>{% if not forloop.last %} <span class="arrow">»</span>{% endif %}
51+
<a href="{{ result.get_absolute_url }}">{{ breadcrumb.title }}</a>{% if not forloop.last %} <span class="arrow">»</span>{% endif %}
5152
{% endfor %}
5253
</span>
5354
</dt>
@@ -60,7 +61,7 @@ <h2 class="result-title">
6061
<ul class="code-links">
6162
{% for name, value in result_code_links.items %}
6263
<li>
63-
<a href="{% url 'document-detail' lang=result.release.lang version=result.release.version url=result.path host 'docs' %}#{{ value.full_path }}">
64+
<a href="{{ result.get_absolute_url }}#{{ value.full_path }}">
6465
<div>
6566
<code>{{ name }}</code>
6667
{% if value.module_path %}<div class="meta">{{ value.module_path }}</div>{% endif %}
@@ -74,6 +75,9 @@ <h2 class="result-title">
7475
{% empty %}
7576
{% if active_category %}
7677
<dt>
78+
{% if active_category == "website" and lang != "en" %}
79+
<p>{% blocktranslate trimmed %}The website content can only be searched in English.{% endblocktranslate %}</p>
80+
{% endif %}
7781
<p>
7882
{% querystring category=None page=None as all_search %}
7983
{% blocktranslate trimmed %}Please try searching <a href="{{ all_search }}">all documentation results</a>.{% endblocktranslate %}

0 commit comments

Comments
 (0)