-
-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
315 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from heapq import nlargest | ||
from typing import List | ||
|
||
from .objects import Mod, ModSimilarity | ||
|
||
|
||
def find_most_similar(mod: Mod, how_many: int = 6) -> List[ModSimilarity]:
    """Return the mods most similar to ``mod``, best match first.

    Candidates are the other published mods for the same game. Each one is
    wrapped in a ``ModSimilarity`` and ranked by its ``similarity`` value.

    Args:
        mod: The mod to find neighbours for.
        how_many: Maximum number of results to return.

    Returns:
        Up to ``how_many`` ``ModSimilarity`` objects in descending order of
        ``similarity``. Candidates with zero similarity are never included.
    """
    candidates = Mod.query.filter(Mod.published,
                                  Mod.game_id == mod.game_id,
                                  Mod.id != mod.id)
    scored = (ModSimilarity(mod, other_mod) for other_mod in candidates)
    # Zero similarity means nothing at all in common, so skip those
    related = (mod_sim for mod_sim in scored if mod_sim.similarity > 0)
    # nlargest() already hands back its result ordered from largest to
    # smallest, so sorting it again in descending order would change nothing.
    return nlargest(how_many, related,
                    key=lambda mod_sim: mod_sim.similarity)
|
||
|
||
def update_similar_mods(mod: Mod, how_many: int = 6) -> None:
    """Bring ``mod.similarities`` in line with its current best matches.

    An unpublished mod simply gets an empty list. For a published mod the
    best matches are recomputed with ``find_most_similar``; rows that fell
    out of that set are deleted, rows still in it get their ``similarity``
    refreshed (along with the mirrored row owned by the other mod, if there
    is one), and matches that were not stored yet are appended.

    Args:
        mod: The mod whose similarity rows should be refreshed.
        how_many: Maximum number of similar mods to keep.
    """
    if not mod.published:
        mod.similarities = []
        return

    top_matches = find_most_similar(mod, how_many)

    # Remove rows for mods that are no longer among the most similar
    for stored in mod.similarities:
        still_wanted = any(stored.other_mod_id == wanted.other_mod_id
                           for wanted in top_matches)
        if still_wanted:
            continue
        stale_rows = ModSimilarity.query.filter(
            ModSimilarity.main_mod_id == stored.main_mod_id,
            ModSimilarity.other_mod_id == stored.other_mod_id)
        stale_rows.delete()

    for wanted in top_matches:
        existing = [stored for stored in mod.similarities
                    if wanted.other_mod_id == stored.other_mod_id]
        if not existing:
            # Add new rows for newly similar mods
            mod.similarities.append(wanted)
            continue
        # Update existing rows for mods that are still similar
        current = existing[0]
        current.similarity = wanted.similarity
        # Update the row with swapped IDs, if any
        for mirrored in current.other_mod.similarities:
            if mirrored.other_mod_id == wanted.main_mod_id:
                mirrored.similarity = wanted.similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import re | ||
from typing import Set, Iterable | ||
|
||
|
||
# Any run of one or more characters outside a-z, A-Z and 0-9 separates words
WORD_SPLIT = re.compile(r'[^a-zA-Z0-9]+')

# Zero-width split point in front of every capital letter,
# used to break StudlyCapsStrings into their pieces
STUDLY_SPLIT = re.compile(r'(?=[A-Z])')

# English words that do not convey meaning about the context
# We care about things like "rocket" and "propellant" and "deltaV"
MEANINGLESS = {
    # Articles and demonstratives
    'the', 'an', 'this', 'these', 'that', 'those',
    # Conjunctions and comparison words
    'and', 'or', 'but', 'however', 'as', 'such', 'than', 'there',
    # Pronouns
    'me', 'my', 'we', 'us', 'our', 'you', 'your',
    'he', 'him', 'she', 'her', 'it', 'they', 'them',
    # Prepositions
    'to', 'from', 'in', 'on', 'for', 'with', 'of', 'into', 'at', 'by',
    # Common verbs, adverbs and other filler
    'what', 'because', 'then',
    'is', 'be', 'been', 'are', 'get', 'getting', 'has', 'have', 'come',
    'do', 'does', 'will', 'make', 'work', 'also', 'more',
    'should', 'so', 'some', 'like', 'likely', 'can', 'seems',
    'really', 'very', 'each', 'yup', 'which',
    # Tails of contractions such as "we've" / "you're" once WORD_SPLIT
    # has cut them at the apostrophe
    've', 're',
    'accommodate', 'manner', 'therefore', 'ever', 'probably', 'almost',
    'something',
    # Terms too generic to tell one mod from another
    'mod', 'pack', 'contains', 'ksp',
    # Pieces of URLs
    'http', 'https', 'www', 'youtube', 'imgur', 'com',
    'github', 'githubusercontent',
    'forum', 'kerbalspaceprogram', 'index', 'thread', 'topic', 'php',
    # Game name and common mod-name decorations
    'kerbal', 'space', 'continued', 'revived', 'updated', 'redux',
    'inc', 'plus',
}
|
||
|
||
def split_with_acronyms(s: str) -> Iterable[str]:
    """Yield the words of ``s``, then the capital-led pieces of each word.

    ``s`` is first cut into words with ``WORD_SPLIT`` and every word is
    yielded unchanged. Each word is then cut again with ``STUDLY_SPLIT`` and
    those pieces are yielded as well. Empty strings and repeats are not
    removed here.
    """
    whole_words = WORD_SPLIT.split(s)
    for word in whole_words:
        yield word
    for word in whole_words:
        for piece in STUDLY_SPLIT.split(word):
            yield piece
|
||
|
||
def meaningful_words(s: str) -> Set[str]:
    """Return the lower-cased words of ``s`` that are worth comparing.

    Tokens come from ``split_with_acronyms``. Single-character tokens and
    purely numeric tokens are dropped, the rest are lower-cased, and anything
    listed in ``MEANINGLESS`` is removed from the result.
    """
    kept = {
        token.lower()
        for token in split_with_acronyms(s)
        if len(token) > 1 and not token.isnumeric()
    }
    return kept - MEANINGLESS
|
||
|
||
def words_similarity(words1: Set[str], words2: Set[str]) -> float:
    """Return the share of words that two sets have in common.

    The result is the size of the intersection divided by the size of the
    union, i.e. a value between 0.0 (nothing shared) and 1.0 (identical).

    Args:
        words1: First set of words.
        words2: Second set of words.

    Returns:
        The ratio as a float; 0.0 when both sets are empty, which avoids a
        division by zero.
    """
    all_words = words1 | words2
    if not all_words:
        # Return a real float so the result type matches the annotation
        return 0.0
    in_both = words1 & words2
    return len(in_both) / len(all_words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
"""Create mod_similarity table | ||
Revision ID: bbcce95b6e79 | ||
Revises: 3fb8a6e2e0a5 | ||
Create Date: 2021-12-16 05:06:06.312797 | ||
""" | ||
|
||
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the ID of the
# migration it revises (see "Revision ID" / "Revises" in the module docstring).
revision = 'bbcce95b6e79'
down_revision = '3fb8a6e2e0a5'
|
||
from alembic import op | ||
from alembic.op import create_table, drop_table | ||
import sqlalchemy as sa | ||
|
||
from KerbalStuff.celery import update_mod_similarities | ||
|
||
# Declarative base local to this migration, used only for the Mod mapping below.
# NOTE(review): `import sqlalchemy as sa` on its own does not obviously load
# the `sa.ext.declarative` submodule; presumably another import has already
# pulled it in by the time this line runs - confirm.
Base = sa.ext.declarative.declarative_base()


class Mod(Base): # type: ignore
    """Minimal mapping of the ``mod`` table for use inside this migration.

    Only the two columns that ``upgrade`` reads are declared: the primary key
    and the ``published`` flag.
    """
    __tablename__ = 'mod'
    id = sa.Column(sa.Integer, primary_key=True)
    published = sa.Column(sa.Boolean, default=False)
|
||
|
||
def upgrade() -> None:
    """Create the ``mod_similarity`` table and its index, then queue a backfill.

    The table holds one row per (main mod, other mod) pair with a non-null
    ``similarity`` value; both ID columns reference ``mod.id`` with
    ``ON DELETE CASCADE`` and together form the primary key. Once the schema
    objects exist, the IDs of all published mods are handed to the
    ``update_mod_similarities`` Celery task.
    """
    columns = (
        sa.Column('main_mod_id', sa.Integer(), nullable=False),
        sa.Column('other_mod_id', sa.Integer(), nullable=False),
        sa.Column('similarity', sa.Float(precision=5), nullable=False),
    )
    constraints = (
        sa.ForeignKeyConstraint(['main_mod_id'], ['mod.id'], ondelete='CASCADE'),
        sa.ForeignKeyConstraint(['other_mod_id'], ['mod.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('main_mod_id', 'other_mod_id', name='pk_mods'),
    )
    create_table('mod_similarity', *columns, *constraints)

    # Index on each main mod's rows with similarity in descending order
    indexed_columns = ['main_mod_id', sa.text('similarity DESC')]
    op.create_index('ix_mod_similarity_main_mod_similarity',
                    'mod_similarity', indexed_columns, unique=False)

    # Ask Celery to build the similarity rows for existing published mods
    session = sa.orm.Session(bind=op.get_bind())
    published_rows = (session.query(Mod)
                      .filter(Mod.published)
                      .with_entities(Mod.id))
    # Each result row is a one-element tuple holding the mod's ID
    published_ids = [mod_id for (mod_id,) in published_rows]
    update_mod_similarities.delay(published_ids)
|
||
def downgrade() -> None:
    """Undo ``upgrade``: drop the similarity index first, then the table."""
    table = 'mod_similarity'
    op.drop_index('ix_mod_similarity_main_mod_similarity', table_name=table)
    drop_table(table)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.