diff --git a/CHANGELOG.md b/CHANGELOG.md index ff0c4ab6..3a0f331a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ The format is based on Keep a Changelog, and this project adheres to Semantic Ve ### Fixed - Replace "www.grants.gov/web/grants/search-grants.html" with "grants.gov/search-grants" +- Replace "www.grants.gov/web/grants/forms/sf-424-family.html" with "grants.gov/forms/forms-repository/sf-424-family" +- Replace "www.cdc.gov/grants/dictionary/index.html" with "www.cdc.gov/grants/dictionary-of-terms/" ## [1.37.0] - 2023-11-21 diff --git a/bloom_nofos/nofos/nofo.py b/bloom_nofos/nofos/nofo.py index d182d342..141bcd6c 100644 --- a/bloom_nofos/nofos/nofo.py +++ b/bloom_nofos/nofos/nofo.py @@ -37,11 +37,29 @@ def replace_chars(file_content): ("\u00A8", "\u25FB"), # from () U+007F DELETE to (◻) U+25FB WHITE MEDIUM SQUARE ("\u007F", "\u25FB"), - # replace this "page not found" url with the new one + ] + + for _from, _to in replacement_chars: + file_content = file_content.replace(_from, _to) + + return file_content + + +def replace_links(file_content): + # grants.gov/web links are broken and don't redirect _and_ say they are 200 🙄 + replacement_chars = [ ( "www.grants.gov/web/grants/search-grants.html", "grants.gov/search-grants", ), + ( "www.grants.gov/web/grants/forms/sf-424-family.html", "grants.gov/forms/forms-repository/sf-424-family", ), + ( "www.cdc.gov/grants/dictionary/index.html", "www.cdc.gov/grants/dictionary-of-terms/", ), ] for _from, _to in replacement_chars: diff --git a/bloom_nofos/nofos/views.py b/bloom_nofos/nofos/views.py index 915689de..abb5d02b 100644 --- a/bloom_nofos/nofos/views.py +++ b/bloom_nofos/nofos/views.py @@ -85,6 +85,7 @@ preserve_table_heading_links, remove_google_tracking_info_from_links, replace_chars, + replace_links, replace_src_for_inline_images, suggest_all_nofo_fields, suggest_nofo_title, @@ -297,8 +298,10 @@ def nofo_import(request, pk=None): ) return redirect(view_path, **kwargs) - # replace 
problematic characters on import + # replace problematic characters/links on import cleaned_content = replace_chars(file_content) + cleaned_content = replace_links(cleaned_content) + + soup = BeautifulSoup(cleaned_content, "html.parser") # Parse the cleaned HTML soup = add_body_if_no_body(soup)