Show similar mods on mod page

KSP-SpaceDock · Mar 12, 2022 · 03acf91 · 03acf91
1 parent b632cb8
commit 03acf91
Show file tree

Hide file tree

Showing 9 changed files with 315 additions and 1 deletion.
diff --git a/KerbalStuff/blueprints/api.py b/KerbalStuff/blueprints/api.py
@@ -22,6 +22,7 @@
     ModList
 from ..search import search_mods, search_users, typeahead_mods, get_mod_score
 from ..thumbnail import thumb_path_from_background_path
+from ..celery import update_mod_similarities
 
 api = Blueprint('api', __name__)
 
@@ -545,6 +546,8 @@ def accept_grant_mod(mod_id: int) -> Tuple[Dict[str, Any], int]:
     mod = _get_mod(mod_id)
     author = _get_mod_pending_author(mod)
     author.accepted = True
+    db.commit()
+    update_mod_similarities.delay([mod.id])
     notify_ckan(mod, 'co-author-added')
     return {'error': False}, 200
 
@@ -581,6 +584,8 @@ def revoke_mod(mod_id: int) -> Tuple[Dict[str, Any], int]:
     author = [a for a in mod.shared_authors if a.user == new_user][0]
     mod.shared_authors = [a for a in mod.shared_authors if a.user != current_user]
     db.delete(author)
+    db.commit()
+    update_mod_similarities.delay([mod.id])
     notify_ckan(mod, 'co-author-removed')
     return {'error': False}, 200
 
@@ -698,6 +703,7 @@ def create_mod() -> Tuple[Dict[str, Any], int]:
         db.commit()
         mod.score = get_mod_score(mod)
         db.commit()
+        update_mod_similarities.delay([mod.id])
         set_game_info(game)
         send_to_ckan(mod)
         return {

diff --git a/KerbalStuff/blueprints/mods.py b/KerbalStuff/blueprints/mods.py
@@ -33,6 +33,7 @@
     Featured, Media, GameVersion, Game, Following
 from ..search import get_mod_score
 from ..thumbnail import thumb_path_from_background_path
+from ..celery import update_mod_similarities
 
 mods = Blueprint('mods', __name__)
 
@@ -295,6 +296,8 @@ def edit_mod(mod_id: int, mod_name: str) -> Union[str, werkzeug.wrappers.Respons
             if not mod.published:
                 newly_published = True
                 mod.published = True
+        db.commit()
+        update_mod_similarities.delay([mod.id])
         if ckan is None:
             ckan = False
         else:
@@ -499,6 +502,8 @@ def publish(mod_id: int, mod_name: str) -> werkzeug.wrappers.Response:
     mod.published = True
     mod.updated = datetime.now()
     mod.score = get_mod_score(mod)
+    db.commit()
+    update_mod_similarities.delay([mod.id])
     send_to_ckan(mod)
     return redirect(url_for("mods.mod", mod_id=mod.id, mod_name=mod.name))
 

diff --git a/KerbalStuff/celery.py b/KerbalStuff/celery.py
@@ -11,6 +11,7 @@
 from .objects import Mod
 from .search import get_mod_score
 from .ckan import import_ksp_versions_from_ckan
+from .similarity import update_similar_mods
 
 app = Celery("tasks", broker=_cfg("redis-connection"))
 
@@ -118,6 +119,14 @@ def ckan_version_import() -> None:
     if game_id > 0:
         import_ksp_versions_from_ckan(game_id)
 
+
+@app.task
+@with_session
+def update_mod_similarities(mod_ids: List[int]) -> None:
+    """Recalculate the stored similar-mod rows for each of the given mods.
+
+    Registered as a Celery task, so callers queue it with ``.delay(...)``.
+
+    :param mod_ids: IDs of the mods whose similarity rows should be rebuilt
+    """
+    for mod_id in mod_ids:
+        mod = Mod.query.get(mod_id)
+        # The ID was captured when the task was queued; if the mod has been
+        # deleted since then the lookup returns None, so skip it instead of
+        # failing the rest of the batch
+        if mod is not None:
+            update_similar_mods(mod)
+
+
 # to debug this:
 # * add PTRACE capability to celery container via docker-compose.yaml
 #   celery:

diff --git a/KerbalStuff/objects.py b/KerbalStuff/objects.py
@@ -2,7 +2,7 @@
 import os.path
 from datetime import datetime
 import re
-from typing import Optional
+from typing import Optional, Dict, Set
 
 import bcrypt
 from flask import url_for
@@ -14,6 +14,7 @@
 
 from . import thumbnail
 from .database import Base
+from .str_similarity import meaningful_words, words_similarity
 
 
 class Following(Base):  # type: ignore
@@ -235,6 +236,7 @@ class Mod(Base):  # type: ignore
     followings = relationship('Following', back_populates='mod')
     # List of users that follow this mods
     followers = association_proxy('followings', 'user')
+    similar_mods = association_proxy('similarities', 'other_mod')
 
     def background_thumb(self) -> Optional[str]:
         return thumbnail.get_or_create(self)
@@ -251,6 +253,20 @@ def background_url(self, protocol: Optional[str], cdn_domain: Optional[str]) ->
         else:
             return url_for('mods.mod_background', mod_id=self.id, mod_name=self.name)
 
+    def get_author_names(self) -> Set[str]:
+        """Return the usernames of this mod's owner and of its shared authors.
+
+        The set is built on first use and cached on the instance in
+        ``_author_names``; later calls return that same set.
+        """
+        self._author_names: Set[str]
+        if hasattr(self, '_author_names'):
+            return self._author_names
+        # NOTE(review): ``username`` is read directly from each shared_authors
+        # entry - confirm those objects expose it (rather than ``.user.username``)
+        names = {self.user.username}
+        names.update(author.username for author in self.shared_authors)
+        self._author_names = names
+        return self._author_names
+
+    def get_words(self, prop_name: str) -> Set[str]:
+        """ Only parse the strings once to speed up mass-compares
+
+        :param prop_name: Name of the attribute of this mod to split into words
+        :return: The meaningful words of that attribute, cached on the instance
+        """
+        if not hasattr(self, '_words'):
+            self._words: Dict[str, Set[str]] = {}
+        if prop_name not in self._words:
+            # getattr's default only covers a missing attribute; a property that
+            # exists but holds None must be treated as empty text as well,
+            # otherwise the regex split inside meaningful_words raises TypeError
+            self._words[prop_name] = meaningful_words(getattr(self, prop_name, None) or '')
+        return self._words[prop_name]
+
     def __repr__(self) -> str:
         return '<Mod %r %r>' % (self.id, self.name)
 
@@ -300,6 +316,36 @@ def __repr__(self) -> str:
         return '<SharedAuthor %r>' % self.user_id
 
 
+class ModSimilarity(Base):  # type: ignore
+    """Row scoring how similar ``other_mod`` is to ``main_mod``.
+
+    The score is computed in ``__init__`` from the overlap of the two mods'
+    author names and of the words in the properties listed in ``WORD_PROPS``.
+    """
+    __tablename__ = 'mod_similarity'
+    # Composite primary key: at most one row per ordered (main, other) pair
+    __table_args__ = (PrimaryKeyConstraint('main_mod_id', 'other_mod_id', name='pk_mods'), )
+    # Weighted sum computed in __init__; larger means more in common
+    similarity = Column(Float(precision=5), nullable=False)
+    main_mod_id = Column(Integer, ForeignKey('mod.id', ondelete='CASCADE'), nullable=False)
+    # The backref gives Mod a 'similarities' list ordered from most to least
+    # similar. passive_deletes together with ondelete='CASCADE' is meant to
+    # leave removal of these rows to the database when a mod is deleted.
+    main_mod = relationship('Mod',
+                            foreign_keys=main_mod_id,
+                            backref=backref('similarities',
+                                            passive_deletes=True,
+                                            order_by=similarity.desc()))
+    other_mod_id = Column(Integer, ForeignKey('mod.id', ondelete='CASCADE'), nullable=False)
+    other_mod = relationship('Mod', foreign_keys=other_mod_id)
+
+    # Index in the same order the 'similarities' backref sorts by
+    Index('ix_mod_similarity_main_mod_similarity', main_mod_id, similarity.desc())
+
+    # Mod attributes whose words are compared between the two mods
+    WORD_PROPS = ['name', 'short_description', 'description']
+
+    def __init__(self, main_mod: Mod, other_mod: Mod) -> None:
+        """Score ``other_mod`` against ``main_mod``.
+
+        Only the two IDs are stored; the relationship attributes are not
+        assigned here.
+
+        :param main_mod: Mod the similarity is calculated for
+        :param other_mod: Mod being compared to ``main_mod``
+        """
+        self.main_mod_id = main_mod.id
+        self.other_mod_id = other_mod.id
+        # Shared authors count for a tenth of what each text property does
+        self.similarity = (0.1 * words_similarity(main_mod.get_author_names(),
+                                                  other_mod.get_author_names())
+                           + sum(words_similarity(main_mod.get_words(prop_name),
+                                                  other_mod.get_words(prop_name))
+                                 for prop_name in self.WORD_PROPS))
+
+    def __repr__(self) -> str:
+        return f'<Mod Similarity {self.main_mod_id} {self.other_mod_id}>'
+
+
 class DownloadEvent(Base):  # type: ignore
     __tablename__ = 'downloadevent'
     id = Column(Integer, primary_key=True)

diff --git a/KerbalStuff/similarity.py b/KerbalStuff/similarity.py
@@ -0,0 +1,47 @@
+from heapq import nlargest
+from typing import List
+
+from .objects import Mod, ModSimilarity
+
+
+def find_most_similar(mod: Mod, how_many: int = 6) -> List[ModSimilarity]:
+    """Score the other published mods of the same game against ``mod``.
+
+    :param mod: Mod to find similar mods for
+    :param how_many: Maximum number of results to return
+    :return: Up to ``how_many`` similarities with a score above zero,
+             ordered from most to least similar
+    """
+    candidates = Mod.query.filter(Mod.published,
+                                  Mod.game_id == mod.game_id,
+                                  Mod.id != mod.id)
+    # Zero similarity means nothing at all in common, so skip those
+    scored = (mod_sim
+              for mod_sim in (ModSimilarity(mod, other_mod)
+                              for other_mod in candidates)
+              if mod_sim.similarity > 0)
+    # nlargest already returns its result ordered largest first,
+    # so no additional sort is needed
+    return nlargest(how_many, scored, key=lambda mod_sim: mod_sim.similarity)
+
+
+def update_similar_mods(mod: Mod, how_many: int = 6) -> None:
+    """Bring the stored similarity rows of ``mod`` in line with a fresh calculation.
+
+    An unpublished mod gets its list of similarities emptied. For a published
+    mod, rows that dropped out of the top results are deleted, rows that are
+    still in it get their score refreshed, and new ones are appended.
+    Nothing is committed in this function.
+
+    :param mod: Mod whose ``similarities`` should be updated
+    :param how_many: Maximum number of similar mods to keep
+    """
+    if not mod.published:
+        mod.similarities = []
+    else:
+        most_similar = find_most_similar(mod, how_many)
+        # Remove rows for mods that are no longer among the most similar
+        # NOTE(review): the delete below goes through a query, so the removed
+        # rows are still in the in-memory mod.similarities list afterwards -
+        # confirm the session treats this as intended
+        for mod_sim in mod.similarities:
+            if not any(mod_sim.other_mod_id == other_sim.other_mod_id
+                       for other_sim in most_similar):
+               ModSimilarity.query\
+                   .filter(ModSimilarity.main_mod_id == mod_sim.main_mod_id,
+                           ModSimilarity.other_mod_id == mod_sim.other_mod_id)\
+                   .delete()
+        for mod_sim in most_similar:
+            # Existing rows of this mod that point at the same other mod
+            match = [other_sim for other_sim in mod.similarities
+                     if mod_sim.other_mod_id == other_sim.other_mod_id]
+            if match:
+                # Update existing rows for mods that are still similar
+                match[0].similarity = mod_sim.similarity
+                # Update the row with swapped IDs, if any
+                for other_sim in match[0].other_mod.similarities:
+                    if other_sim.other_mod_id == mod_sim.main_mod_id:
+                        other_sim.similarity = mod_sim.similarity
+            else:
+                # Add new rows for newly similar mods
+                mod.similarities.append(mod_sim)
diff --git a/KerbalStuff/str_similarity.py b/KerbalStuff/str_similarity.py
@@ -0,0 +1,55 @@
+import re
+from typing import Set, Iterable
+
+
+# Split words on one or more non-alphanumerics
+# (only ASCII letters and digits count as part of a word)
+WORD_SPLIT = re.compile(r'[^a-zA-Z0-9]+')
+
+# Split up pieces of StudlyCapsStrings
+# The pattern is a zero-width lookahead, so the split happens just before each
+# uppercase letter and nothing is consumed; re.split only supports patterns
+# that can match an empty string from Python 3.7 on
+STUDLY_SPLIT = re.compile(r'(?=[A-Z])')
+
+# English words that do not convey meaning about the context
+# We care about things like "rocket" and "propellant" and "deltaV"
+# Entries are lowercase because meaningful_words lowercases every word
+# before subtracting this set
+MEANINGLESS = {
+    'the', 'an', 'this', 'these', 'that', 'those',
+    'and', 'or', 'but', 'however',
+    'as', 'such', 'than', 'there',
+    'me', 'my', 'we', 'us', 'our',
+    'you', 'your', 'he', 'him', 'she', 'her', 'it',
+    'they', 'them',
+    'to', 'from', 'in', 'on', 'for', 'with', 'of', 'into', 'at', 'by',
+    'what', 'because', 'then',
+    'is', 'be', 'been', 'are', 'get', 'getting', 'has', 'have', 'come',
+    'do', 'does',
+    'will', 'make', 'work', 'also', 'more',
+    'should', 'so', 'some', 'like', 'likely', 'can', 'seems',
+    'really', 'very', 'each', 'yup', 'which',
+    # Leftovers of contractions such as "we've" and "you're"
+    've', 're',
+    'accommodate', 'manner', 'therefore', 'ever', 'probably', 'almost',
+    'something',
+    'mod', 'pack', 'contains', 'ksp',
+    # Fragments of URLs
+    'http', 'https', 'www', 'youtube', 'imgur', 'com',
+    'github', 'githubusercontent',
+    'forum', 'kerbalspaceprogram', 'index', 'thread', 'topic', 'php',
+    'kerbal', 'space', 'continued', 'revived', 'updated', 'redux',
+    'inc', 'plus',
+}
+
+
+def split_with_acronyms(s: str) -> Iterable[str]:
+    """Yield the words of ``s``, then the pieces of each word split at capitals.
+
+    Whole words come first, so "deltaV" is yielded as well as "delta" and "V".
+    Empty strings and duplicates can be yielded; filtering them is left to
+    the caller.
+
+    :param s: Text to split
+    """
+    whole_words = WORD_SPLIT.split(s)
+    for word in whole_words:
+        yield word
+    for word in whole_words:
+        for piece in STUDLY_SPLIT.split(word):
+            yield piece
+
+
+def meaningful_words(s: str) -> Set[str]:
+    """Return the lowercased words of ``s`` that are worth comparing.
+
+    Tokens shorter than two characters, purely numeric tokens and anything
+    listed in MEANINGLESS are dropped.
+
+    :param s: Text to extract words from
+    """
+    candidates = {word.lower()
+                  for word in split_with_acronyms(s)
+                  if len(word) > 1 and not word.isnumeric()}
+    return candidates - MEANINGLESS
+
+
+def words_similarity(words1: Set[str], words2: Set[str]) -> float:
+    """Return the fraction of all words that appear in both sets.
+
+    This is the size of the intersection divided by the size of the union,
+    so the result lies between 0.0 (nothing shared) and 1.0 (identical sets).
+
+    :param words1: First set of words
+    :param words2: Second set of words
+    :return: Overlap ratio; 0.0 when both sets are empty
+    """
+    all_words = words1 | words2
+    if not all_words:
+        # Nothing to compare; return a real float to match the annotation
+        # rather than the integer 0
+        return 0.0
+    return len(words1 & words2) / len(all_words)
diff --git a/alembic/versions/2021_12_15_23_06_02-bbcce95b6e79.py b/alembic/versions/2021_12_15_23_06_02-bbcce95b6e79.py
@@ -0,0 +1,47 @@
+"""Create mod_similarity table
+
+Revision ID: bbcce95b6e79
+Revises: 3fb8a6e2e0a5
+Create Date: 2021-12-16 05:06:06.312797
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = 'bbcce95b6e79'
+down_revision = '3fb8a6e2e0a5'
+
+from alembic import op
+from alembic.op import create_table, drop_table
+import sqlalchemy as sa
+
+from KerbalStuff.celery import update_mod_similarities
+
+Base = sa.ext.declarative.declarative_base()
+
+class Mod(Base):  # type: ignore
+    """Minimal mapping of the mod table, local to this migration.
+
+    Declares only the two columns that upgrade() reads.
+    """
+    __tablename__ = 'mod'
+    id = sa.Column(sa.Integer, primary_key=True)
+    published = sa.Column(sa.Boolean, default=False)
+
+
+def upgrade() -> None:
+    """Create the mod_similarity table and its index, then queue the initial fill."""
+    create_table('mod_similarity',
+                 sa.Column('main_mod_id', sa.Integer(), nullable=False),
+                 sa.Column('other_mod_id', sa.Integer(), nullable=False),
+                 sa.Column('similarity', sa.Float(precision=5), nullable=False),
+                 sa.ForeignKeyConstraint(['main_mod_id'], ['mod.id'], ondelete='CASCADE'),
+                 sa.ForeignKeyConstraint(['other_mod_id'], ['mod.id'], ondelete='CASCADE'),
+                 sa.PrimaryKeyConstraint('main_mod_id', 'other_mod_id', name='pk_mods'))
+    # Index on each mod's rows with the highest similarity first
+    op.create_index('ix_mod_similarity_main_mod_similarity',
+                    'mod_similarity', ['main_mod_id', sa.text('similarity DESC')], unique=False)
+
+    # Ask Celery to build the similarity rows for existing published mods
+    # with_entities(Mod.id) yields one-element rows, hence the "mod_id," unpacking
+    # NOTE(review): the task is queued from inside the migration, so the Celery
+    # broker presumably has to be reachable while migrating, and a worker could
+    # pick the task up before this migration is committed - confirm
+    update_mod_similarities.delay([mod_id for mod_id, in
+                                   sa.orm.Session(bind=op.get_bind())
+                                         .query(Mod)
+                                         .filter(Mod.published)
+                                         .with_entities(Mod.id)])
+
+def downgrade() -> None:
+    """Undo upgrade(): remove the similarity index, then the table itself."""
+    similarity_table = 'mod_similarity'
+    op.drop_index('ix_mod_similarity_main_mod_similarity', table_name=similarity_table)
+    op.drop_table(similarity_table)
diff --git a/templates/mod.html b/templates/mod.html
@@ -394,6 +394,21 @@ <h3>Export Raw Stats</h3>
         </div>
     </div>
 </div>
+{% if mod.similar_mods %}
+<div class="well">
+    <div class="container main-cat">
+        <h3>Similar-ish Mods</h3>
+    </div>
+</div>
+<div class="container">
+    <div class="row">
+        {% set similar_mods = mod.similar_mods[:6] -%}
+        {%- for mod in similar_mods -%}
+        {%- include "mod-box.html" -%}
+        {%- endfor %}
+    </div>
+</div>
+{% endif %}
 {% if editable %}
 <div class="modal fade" id="confirm-delete" tabindex="-1" role="dialog" aria-labelledby="confirm-delete" aria-hidden="true">
     <div class="modal-dialog">