Skip to content

Commit

Permalink
Update analyses
Browse files Browse the repository at this point in the history
  • Loading branch information
johnsamuelwrites committed Aug 21, 2024
1 parent 444b6d1 commit 0b62970
Showing 1 changed file with 274 additions and 20 deletions.
294 changes: 274 additions & 20 deletions src/main/metadata.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,44 +2,59 @@
"cells": [
{
"cell_type": "markdown",
"id": "0c7ab5c7-f281-4552-9c94-e41f9384c741",
"id": "a1f411f1-9d7a-469a-b50a-c421077fb10d",
"metadata": {},
"source": [
"# Analyses Report Notebook"
]
},
{
"cell_type": "markdown",
"id": "543d5a68-c3a4-4e76-9380-7822b86818e8",
"id": "47d975ca-05c4-40a0-be0b-9842b8b942da",
"metadata": {},
"source": [
"## Importing necessary packages"
"## Installing and importing necessary packages"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc0e4b12-1d23-425d-bcb5-9737e7c8e0b3",
"id": "c01ff3e2-9921-4557-9531-3d5052735e46",
"metadata": {},
"outputs": [],
"source": [
"!pip install extruct"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82ee0e8e-e313-4460-8e53-e6eb7c27fcf1",
"metadata": {},
"outputs": [],
"source": [
"from analyse import HTMLTextAnalysis, WebsiteAnalysis\n",
"from metadata import get_title_from_link, replace_name\n",
"import pandas"
"from datetime import datetime\n",
"import urllib.request\n",
"import urllib.parse\n",
"import json\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "markdown",
"id": "4cefb181-0b4d-4638-9843-b8b64dc121dc",
"id": "af1e60f2-8282-4684-8fce-5cc60c643c22",
"metadata": {},
"source": [
"## Create dataframe of complelte article list"
"## Create dataframe of complete article list"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5adee351-5c47-474c-9726-dc37919f5040",
"id": "bdea70f1-fc4a-4c66-a64b-5273bebcb655",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -49,25 +64,35 @@
{
"cell_type": "code",
"execution_count": null,
"id": "481df64d-4c90-4fc3-b856-f78e7da0f2ed",
"id": "842f4526-3868-4155-89c8-3c248a55fc76",
"metadata": {},
"outputs": [],
"source": []
"source": [
"article_list_df.sort_values([\"first\"], ascending=False).head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5ab3966-0fbf-4395-8cfa-a13992a3296c",
"id": "5c067b41-36f8-426c-9d79-01dd58ee3051",
"metadata": {},
"outputs": [],
"source": [
"article_list_df.sort_values([\"first\"], ascending=False).head(10)"
"article_list_df"
]
},
{
"cell_type": "markdown",
"id": "cdc1ebd4-7e7e-49a2-88ba-18c47fbe11ee",
"metadata": {},
"source": [
"### Filter out specific articles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ac3ec21-1381-4585-a1c6-ffdcfeaa45e4",
"id": "b4937bf2-3a15-4c02-b385-ff9624c8cc85",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -81,7 +106,17 @@
{
"cell_type": "code",
"execution_count": null,
"id": "40b3cf73-8730-4d6d-b46a-8123427ca22b",
"id": "117afb6e-b8d1-4b43-bfe4-febf29724c56",
"metadata": {},
"outputs": [],
"source": [
"lang_df = article_list_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03966382-c1ad-48e5-8049-2f721f89e728",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -90,23 +125,29 @@
"lang_df.sort_values([\"first\"], ascending=False)"
]
},
{
"cell_type": "markdown",
"id": "39a5bfa3-14d7-4105-9170-7dc032408967",
"metadata": {},
"source": [
"## Get current titles of the articles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89029324-d905-4ef3-bf89-5ba62fa5702c",
"id": "8e1a8ed3-c585-44f9-8319-df616d3ecbb2",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"\n",
"lang_df = lang_df.assign(created=lang_df[\"first\"].apply(datetime.fromtimestamp))\n",
"lang_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5dada63d-745c-4d6d-9478-d5a3f08da532",
"id": "080cdce2-a325-4105-b83a-94dbe64dfcca",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -117,14 +158,227 @@
{
"cell_type": "code",
"execution_count": null,
"id": "77ef428e-7642-486d-9a4f-2eb589ff8b62",
"id": "1101a311-274a-463f-a02f-a070948b8815",
"metadata": {},
"outputs": [],
"source": [
"sorted_lang_df.to_csv(\n",
" \"output.csv\", index=False, columns=[\"filepath\", \"title\", \"created\"], sep=\";\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "7fd50857-cdd8-4901-93bc-00e5cef4ee32",
"metadata": {},
"source": [
"## Helper functions for Querying Wikibase"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "23695b96-d7e1-4693-8bb6-940048812ee9",
"metadata": {},
"outputs": [],
"source": [
"def quote_query(query):\n",
" query = urllib.parse.quote(query, safe=\"\")\n",
" return query"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73ecb65e-6e54-4f14-85e4-f6841a5b2866",
"metadata": {},
"outputs": [],
"source": [
"def query_wikibase(query):\n",
" url = \"https://jsamwrites.wikibase.cloud/query/sparql?format=json&query=\"+ quote_query(query)\n",
" response = urllib.request.urlopen(url)\n",
" responsedata = json.loads(response.read().decode(\"utf-8\"))\n",
" \n",
" array = []\n",
" for data in responsedata[\"results\"][\"bindings\"]:\n",
" array.append([data[\"item\"][\"value\"], data[\"title\"][\"value\"], data[\"url\"][\"value\"]])\n",
" \n",
" wikibase_dataframe = pd.DataFrame(array, columns=[\"item\", \"title\", \"url\"])\n",
" return wikibase_dataframe"
]
},
{
"cell_type": "markdown",
"id": "17b58dbf-02b5-42b0-9b2c-90462a86cfde",
"metadata": {},
"source": [
"## SPARQL query for getting the existing article list from Wikibase"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2110c9e7-52de-4095-a616-00d4c38b0eea",
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>\n",
"PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>\n",
"\n",
"SELECT DISTINCT ?item ?title ?url{\n",
" ?item wdt:P3 ?url;\n",
" wdt:P27 ?title;\n",
" wdt:P10 ?time.\n",
"}\n",
"ORDER by ?time\"\"\"\n",
"\n",
"wikibase_dataframe = query_wikibase(query)\n",
"wikibase_dataframe"
]
},
{
"cell_type": "markdown",
"id": "f64289c1-d7a7-48a8-8f92-29d1b41b2142",
"metadata": {},
"source": [
"Replace relative filepath with complete URL"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3678ca41-e6c9-4f3c-96f1-f08ebfaceb7a",
"metadata": {},
"outputs": [],
"source": [
"sorted_lang_df[\"filepath\"].replace(\"\\.\\.\\/\\.\\.\", \"https://johnsamuel.info\",inplace=True,regex=True)\n",
"sorted_lang_df[\"filepath\"].replace(\" \", \"%20\" ,inplace=True,regex=True)\n",
"sorted_lang_df[\"filepath\"]"
]
},
{
"cell_type": "markdown",
"id": "a8a61892-d673-4766-9532-92df4b863ec0",
"metadata": {},
"source": [
"## Find the missing articles that are not yet on Wikibase"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3c6acab-4e5e-4232-982d-53c88354888f",
"metadata": {},
"outputs": [],
"source": [
"missing_articles = sorted_lang_df[~sorted_lang_df[\"filepath\"].isin(wikibase_dataframe[\"url\"])]\n",
"missing_articles"
]
},
{
"cell_type": "markdown",
"id": "dfb0b3c8-b491-48bc-a4a9-b997ae6eafd6",
"metadata": {},
"source": [
"### Generate QuickStatements for the missing articles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "365e6de8-8eef-4897-88d3-55f8a030bf60",
"metadata": {},
"outputs": [],
"source": [
"with open(\"quickstatements.csv\",\"w\") as qw:\n",
" for index,article in missing_articles.iterrows():\n",
" qw.write(\"CREATE\\n\")\n",
" qw.write('LAST|Den|\"web page\"\\n')\n",
" qw.write('LAST|Dfr|\"page web\"\\n')\n",
" qw.write(f\"LAST|L{article['language']}|\\\"{article['title']}\\\"\\n\")\n",
" qw.write(f\"LAST|P27|{article['language']}:\\\"{article['title']}\\\"\\n\")\n",
" qw.write(\"LAST|P17|Q48\\n\")\n",
" qw.write(\"LAST|P8|Q45\\n\")\n",
" qw.write(f\"LAST|P3|\\\"{re.sub(' ', '%20', str(article['filepath']))}\\\"\\n\")\n",
" qw.write(\"LAST|P13|Q1041\\n\")\n",
" qw.write(\"LAST|P15|Q38\\n\")\n",
" qw.write(f\"LAST|P10|+{re.sub(r' ..:..:..','T00:00:00', str(article['created']))}Z/11\\n\\n\")"
]
},
{
"cell_type": "markdown",
"id": "e241102b-b41a-40b4-b42b-d8e084f6f47a",
"metadata": {},
"source": [
"## Verification of the titles\n",
"Verify whether the titles on the Wikibase are the same as those right now"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9a10608-7428-4729-b0fa-d3e8042b9921",
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"PREFIX wd: <https://jsamwrites.wikibase.cloud/entity/>\n",
"PREFIX wdt: <https://jsamwrites.wikibase.cloud/prop/direct/>\n",
"\n",
"SELECT DISTINCT ?item ?title ?url{\n",
" ?item wdt:P3 ?url;\n",
" wdt:P27 ?title;\n",
" wdt:P10 ?time.\n",
"}\n",
"ORDER by ?time\"\"\"\n",
"\n",
"wikibase_dataframe = query_wikibase(query)\n",
"wikibase_dataframe\n",
"\n",
"sorted_lang_df[\"filepath\"].replace(\"\\.\\.\\/\\.\\.\", \"https://johnsamuel.info\",inplace=True,regex=True)\n",
"sorted_lang_df[\"filepath\"].replace(\" \", \"%20\" ,inplace=True,regex=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8858eb4-9b3b-4899-bf96-928a10c2d15a",
"metadata": {},
"outputs": [],
"source": [
"stale_title_list = []\n",
"for index,article in wikibase_dataframe.iterrows():\n",
" url = article[\"url\"]\n",
" wikibase_article = sorted_lang_df[sorted_lang_df[\"filepath\"]==url]\n",
" wikibase_title = wikibase_article[\"title\"]\n",
" if len(wikibase_title) > 0:\n",
" wikibase_title = wikibase_title.values[0]\n",
" if(wikibase_title != str(article[\"title\"])):\n",
" print(\"Incorrect\", wikibase_title, str(article[\"title\"]))\n",
" stale_title_list.append((article[\"item\"].replace(\"https://jsamwrites.wikibase.cloud/entity/\", \"\"), str(wikibase_article[\"title\"].values[0]), str(wikibase_article[\"language\"].values[0])))\n",
" else:\n",
" #pass\n",
" print(url, wikibase_article)\n"
]
},
{
"cell_type": "markdown",
"id": "5241a46a-2a38-4497-9c0e-f223f7a554bc",
"metadata": {},
"source": [
"## Generate QuickStatements for correcting titles"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f0fd10a-180e-42f6-9aa2-464ad41116c5",
"metadata": {},
"outputs": [],
"source": [
"with open(\"quickstatements.csv\",\"w\") as qw:\n",
" for values in stale_title_list:\n",
" qw.write(f\"{values[0]}|P27|{values[2]}:\\\"{values[1]}\\\"\\n\")"
]
}
],
"metadata": {
Expand All @@ -143,7 +397,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.10.12"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 0b62970

Please sign in to comment.