diff --git a/KerbalStuff/blueprints/api.py b/KerbalStuff/blueprints/api.py index b8e1eabb..c8539df6 100644 --- a/KerbalStuff/blueprints/api.py +++ b/KerbalStuff/blueprints/api.py @@ -22,6 +22,7 @@ ModList from ..search import search_mods, search_users, typeahead_mods, get_mod_score from ..thumbnail import thumb_path_from_background_path +from ..celery import update_mod_similarities api = Blueprint('api', __name__) @@ -545,6 +546,8 @@ def accept_grant_mod(mod_id: int) -> Tuple[Dict[str, Any], int]: mod = _get_mod(mod_id) author = _get_mod_pending_author(mod) author.accepted = True + db.commit() + update_mod_similarities.delay([mod.id]) notify_ckan(mod, 'co-author-added') return {'error': False}, 200 @@ -581,6 +584,8 @@ def revoke_mod(mod_id: int) -> Tuple[Dict[str, Any], int]: author = [a for a in mod.shared_authors if a.user == new_user][0] mod.shared_authors = [a for a in mod.shared_authors if a.user != current_user] db.delete(author) + db.commit() + update_mod_similarities.delay([mod.id]) notify_ckan(mod, 'co-author-removed') return {'error': False}, 200 @@ -698,6 +703,7 @@ def create_mod() -> Tuple[Dict[str, Any], int]: db.commit() mod.score = get_mod_score(mod) db.commit() + update_mod_similarities.delay([mod.id]) set_game_info(game) send_to_ckan(mod) return { diff --git a/KerbalStuff/blueprints/mods.py b/KerbalStuff/blueprints/mods.py index c1c28339..bb06cf9c 100644 --- a/KerbalStuff/blueprints/mods.py +++ b/KerbalStuff/blueprints/mods.py @@ -33,6 +33,7 @@ Featured, Media, GameVersion, Game, Following from ..search import get_mod_score from ..thumbnail import thumb_path_from_background_path +from ..celery import update_mod_similarities mods = Blueprint('mods', __name__) @@ -295,6 +296,8 @@ def edit_mod(mod_id: int, mod_name: str) -> Union[str, werkzeug.wrappers.Respons if not mod.published: newly_published = True mod.published = True + db.commit() + update_mod_similarities.delay([mod.id]) if ckan is None: ckan = False else: @@ -499,6 +502,8 @@ def 
@app.task
@with_session
def update_mod_similarities(mod_ids: List[int]) -> None:
    """Refresh the stored similarity rows of each of the given mods.

    :param mod_ids: IDs of the mods to pass to update_similar_mods
    """
    for mod_id in mod_ids:
        mod = Mod.query.get(mod_id)
        # The lookup yields None when no such mod exists (e.g. it was deleted
        # after the task was queued); skip it so that one stale ID does not
        # abort the whole batch with an AttributeError on mod.published
        if mod is not None:
            update_similar_mods(mod)
def get_author_names(self) -> Set[str]:
    """ Usernames of the owner and the shared authors, computed once per instance """
    # NOTE(review): other code reaches the account of a shared author via
    # a.user; confirm that shared authors also expose .username directly
    if not hasattr(self, '_author_names'):
        self._author_names: Set[str] = {self.user.username,
                                        *(a.username for a in self.shared_authors)}
    return self._author_names

def get_words(self, prop_name: str) -> Set[str]:
    """ Only parse the strings once to speed up mass-compares """
    if not hasattr(self, '_words'):
        self._words: Dict[str, Set[str]] = {}
    if prop_name not in self._words:
        # The getattr default only covers a missing attribute; a property that
        # exists but holds None would make meaningful_words raise TypeError,
        # so treat it like an empty string
        self._words[prop_name] = meaningful_words(getattr(self, prop_name, None) or '')
    return self._words[prop_name]
def find_most_similar(mod: Mod, how_many: int = 6) -> List[ModSimilarity]:
    """Score the other published mods of the same game against ``mod``.

    :param mod: the mod to find matches for
    :param how_many: maximum number of matches to return
    :return: newly constructed ModSimilarity objects, most similar first
    """
    candidates = (ModSimilarity(mod, other_mod)
                  for other_mod in Mod.query.filter(Mod.published,
                                                    Mod.game_id == mod.game_id,
                                                    Mod.id != mod.id))
    # Zero similarity means nothing at all in common, so skip those.
    # nlargest already returns its result ordered from largest to smallest,
    # so sorting it again would be wasted work.
    return nlargest(how_many,
                    (mod_sim for mod_sim in candidates if mod_sim.similarity > 0),
                    key=lambda mod_sim: mod_sim.similarity)
import re
from typing import Iterable, Optional, Set


# Split words on one or more non-alphanumerics
WORD_SPLIT = re.compile(r'[^a-zA-Z0-9]+')

# Split up pieces of StudlyCapsStrings
STUDLY_SPLIT = re.compile(r'(?=[A-Z])')

# English words that do not convey meaning about the context
# We care about things like "rocket" and "propellant" and "deltaV"
MEANINGLESS = {
    'the', 'an', 'this', 'these', 'that', 'those',
    'and', 'or', 'but', 'however',
    'as', 'such', 'than', 'there',
    'me', 'my', 'we', 'us', 'our',
    'you', 'your', 'he', 'him', 'she', 'her', 'it',
    'they', 'them',
    'to', 'from', 'in', 'on', 'for', 'with', 'of', 'into', 'at', 'by',
    'what', 'because', 'then',
    'is', 'be', 'been', 'are', 'get', 'getting', 'has', 'have', 'come',
    'do', 'does',
    'will', 'make', 'work', 'also', 'more',
    'should', 'so', 'some', 'like', 'likely', 'can', 'seems',
    'really', 'very', 'each', 'yup', 'which',
    've', 're',
    'accommodate', 'manner', 'therefore', 'ever', 'probably', 'almost',
    'something',
    'mod', 'pack', 'contains', 'ksp',
    'http', 'https', 'www', 'youtube', 'imgur', 'com',
    'github', 'githubusercontent',
    'forum', 'kerbalspaceprogram', 'index', 'thread', 'topic', 'php',
    'kerbal', 'space', 'continued', 'revived', 'updated', 'redux',
    'inc', 'plus',
}


def split_with_acronyms(s: str) -> Iterable[str]:
    """Yield the words of ``s``, then the pieces of each word.

    The text is first split on runs of non-alphanumerics and those words are
    yielded unchanged. Each word is then split before every capital letter,
    so "deltaV" additionally yields "delta" and "V". Empty strings and
    repeated words may be yielded; callers are expected to filter them.
    """
    words = WORD_SPLIT.split(s)
    yield from words
    for w in words:
        yield from STUDLY_SPLIT.split(w)


def meaningful_words(s: Optional[str]) -> Set[str]:
    """Return the lower-cased words of ``s`` that carry meaning.

    Single characters, purely numeric words and the words listed in
    MEANINGLESS are dropped.

    :param s: text to analyse; None or an empty string yields an empty set
    """
    if not s:
        # Nothing to parse, and None would make the regex split raise TypeError
        return set()
    return {w.lower()
            for w in split_with_acronyms(s)
            if len(w) > 1 and not w.isnumeric()} - MEANINGLESS


def words_similarity(words1: Set[str], words2: Set[str]) -> float:
    """Return the share of words that both sets have in common.

    This is the size of the intersection divided by the size of the union,
    i.e. 1.0 for identical non-empty sets and 0.0 for disjoint ones.
    Two empty sets count as having nothing in common.
    """
    all_words = words1 | words2
    if not all_words:
        return 0.0
    return len(words1 & words2) / len(all_words)
def downgrade() -> None:
    """Revert this revision by dropping the similarity index and then its table."""
    table = 'mod_similarity'
    op.drop_index('ix_mod_similarity_main_mod_similarity', table_name=table)
    op.drop_table(table)

Export Raw Stats

+{% if mod.similar_mods %} +
+
+

Similar-ish Mods

+
+
+
+
+ {% set similar_mods = mod.similar_mods[:6] -%} + {%- for mod in similar_mods -%} + {%- include "mod-box.html" -%} + {%- endfor %} +
+
+{% endif %} {% if editable %}