Pipeline improvements: finalizing asset similarity
tiptr committed Dec 26, 2022
1 parent 16fb0fd commit ae91f7b
Showing 7 changed files with 86 additions and 36 deletions.
backend/api/python_endpoints/asset_endpoints.py (4 additions, 0 deletions)
@@ -41,3 +41,7 @@ def add_keyword(asset_iri: str, keyword: str):
         keyword (str): _description_
     """
     ASSETS_DAO.add_keyword(asset_iri=asset_iri, keyword=keyword)
+
+
+def get_keywords_set_for_asset(asset_iri: str):
+    return ASSETS_DAO.get_keywords_set_for_asset(asset_iri=asset_iri)
backend/api/python_endpoints/file_endpoints.py (4 additions, 4 deletions)
@@ -162,10 +162,6 @@ def save_extracted_text(file_iri: str, text: str):
     SUPPL_FILE_DAO.save_extracted_text(file_iri=file_iri, text=text)
 
 
-def get_keywords_set_for_asset(asset_iri: str):
-    return SUPPL_FILE_DAO.get_keywords_set_for_asset(asset_iri=asset_iri)
-
-
 def reset_dimension_clusters():
     SUPPL_FILE_DAO.reset_dimension_clusters()
 
@@ -182,3 +178,7 @@ def add_file_to_dimension_cluster(file_iri: str, cluster_iri: str):
     SUPPL_FILE_DAO.add_file_to_dimension_cluster(
         file_iri=file_iri, cluster_iri=cluster_iri
     )
+
+
+def get_dimensions_cluster_for_asset(asset_iri: str):
+    return SUPPL_FILE_DAO.get_dimensions_cluster_for_asset(asset_iri=asset_iri)
backend/knowledge_graph/dao/AssetNodesDao.py (35 additions, 0 deletions)
@@ -155,3 +155,38 @@ def add_keyword(self, asset_iri: str, keyword: str):
         )
 
         self.ps.graph_create(relationship)
+
+    def get_keywords_set_for_asset(self, asset_iri: str):
+        # File keywords
+        file_keywords_table = self.ps.graph_run(
+            "MATCH p=(a:"
+            + NodeTypes.ASSET.value
+            + ' {iri: "'
+            + asset_iri
+            + '"})-[r1:'
+            + RelationshipTypes.HAS_SUPPLEMENTARY_FILE.value
+            + "]->(t)-[r2:"
+            + RelationshipTypes.KEYWORD_EXTRACTION.value
+            + "]->(c) RETURN c.keyword"
+        ).to_table()
+
+        file_keyword_list = [keyword[0] for keyword in file_keywords_table]
+
+        # Asset keywords
+        asset_keywords_table = self.ps.graph_run(
+            "MATCH p=(a:"
+            + NodeTypes.ASSET.value
+            + ' {iri: "'
+            + asset_iri
+            + '"})-[r1:'
+            + RelationshipTypes.KEYWORD_EXTRACTION.value
+            + "]->(c) RETURN c.keyword"
+        ).to_table()
+
+        asset_keyword_list = [keyword[0] for keyword in asset_keywords_table]
+
+        # Combine
+        keyword_list = file_keyword_list
+        keyword_list.extend(asset_keyword_list)
+
+        return set(keyword_list)
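
Editor's note: the new DAO method concatenates the raw asset IRI into the Cypher string. A parameterized variant is sketched below; it assumes the ps wrapper forwards keyword arguments to py2neo's Graph.run, which this diff does not show, so treat it as illustrative only.

# Sketch, not the committed code: $iri is bound server-side, so the IRI
# needs no manual quoting and cannot break out of the query string.
query = (
    f"MATCH (a:{NodeTypes.ASSET.value} {{iri: $iri}})"
    f"-[:{RelationshipTypes.KEYWORD_EXTRACTION.value}]->(c) "
    "RETURN c.keyword"
)
asset_keywords = {
    row[0] for row in self.ps.graph_run(query, iri=asset_iri).to_table()
}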
backend/knowledge_graph/dao/SupplementaryFileNodesDao.py (16 additions, 17 deletions)
@@ -206,23 +206,6 @@ def save_extracted_text(self, file_iri: str, text: str):
         node.update(extracted_text=text)
         self.ps.graph_push(node)
 
-    def get_keywords_set_for_asset(self, asset_iri: str):
-        keywords_table = self.ps.graph_run(
-            "MATCH p=(a:"
-            + NodeTypes.ASSET.value
-            + ' {iri: "'
-            + asset_iri
-            + '"})-[r1:'
-            + RelationshipTypes.HAS_SUPPLEMENTARY_FILE.value
-            + "]->(t)-[r2:"
-            + RelationshipTypes.KEYWORD_EXTRACTION.value
-            + "]->(c) RETURN c.keyword"
-        ).to_table()
-
-        keyword_list = [keyword[0] for keyword in keywords_table]
-
-        return set(keyword_list)
-
     def reset_dimension_clusters(self):
         self.ps.graph_run(
             f"MATCH (n:{NodeTypes.DIMENSION_CLUSTER.value}) DETACH DELETE n"
@@ -252,3 +235,19 @@ def add_file_to_dimension_cluster(self, file_iri: str, cluster_iri: str):
         )
 
         self.ps.graph_create(relationship)
+
+    def get_dimensions_cluster_for_asset(self, asset_iri: str):
+
+        cluster_matches = self.ps.repo_match(model=DimensionClusterNode).where(
+            "(:"
+            + NodeTypes.ASSET.value
+            + ' {iri: "'
+            + asset_iri
+            + '"})-[:'
+            + RelationshipTypes.HAS_SUPPLEMENTARY_FILE.value
+            + "]->()-[:"
+            + RelationshipTypes.PART_OF_DIMENSION_CLUSTER.value
+            + "]->(_)"
+        )
+
+        return cluster_matches.first()
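
Editor's note: .first() returns None when none of the asset's supplementary files sits in a dimension cluster, which is why callers must guard the result. A minimal caller-side sketch, with the DAO instance and IRI as placeholders:

# Hypothetical usage: suppl_file_dao and example_iri are stand-ins.
cluster = suppl_file_dao.get_dimensions_cluster_for_asset(asset_iri=example_iri)
if cluster is not None:
    print(cluster.iri)  # the single dimension cluster linked via a CAD file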
similarity_pipeline/similarity_pipeline_5_cad_analysis.py (3 additions, 3 deletions)
@@ -123,9 +123,9 @@ def similarity_pipeline_5_cad_analysis():
             max_z = max(point[stl.Dimension.Z], max_z)
             min_z = min(point[stl.Dimension.Z], min_z)
 
-        length = max_x - min_x
-        width = max_y - min_y
-        height = max_z - min_z
+        width = max_x - min_x
+        height = max_y - min_y
+        length = max_z - min_z
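 
         properties_dict = {
             "length": float(length),

Editor's note: the change relabels the bounding-box extents so that x maps to width, y to height, and z to length. The same extents can be computed in vectorized form with numpy-stl; the sketch below uses a stand-in file name:

from stl import mesh  # numpy-stl

# Sketch: "part.stl" stands in for the pipeline's downloaded CAD file.
# mesh.vectors has shape (n_facets, 3 vertices, 3 coords), so reducing over
# the first two axes gives the per-axis minima and maxima in one call each.
m = mesh.Mesh.from_file("part.stl")
mins = m.vectors.min(axis=(0, 1))
maxs = m.vectors.max(axis=(0, 1))
width, height, length = maxs - mins  # x, y, z extents, per the new mapping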
similarity_pipeline/similarity_pipeline_6_image_analysis.py (3 additions, 3 deletions)
@@ -72,8 +72,8 @@ def similarity_pipeline_6_image_analysis():
         tmp_file_path = "./temporary_picture.jpg"
 
         with io.FileIO(tmp_file_path, "w") as tmp_file:
-            for pdf_line in file_stream:
-                tmp_file.write(pdf_line)
+            for file_line in file_stream:
+                tmp_file.write(file_line)
 
         logger.info("Detecting objects...")
 
@@ -108,7 +108,7 @@ def similarity_pipeline_6_image_analysis():
         logger.info("Extracting text from PDF...")
         text_encoded = textract.process(
             tmp_file_path,
-            method="tesseract",
+            method="tesseract-ocr",
             # language="eng",
         )
         text = str(text_encoded, "UTF-8")
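
Editor's note: textract shells out to Tesseract for the OCR step here. An equivalent direct-OCR sketch with pytesseract, assuming a local Tesseract install and the temporary image written above:

from PIL import Image
import pytesseract

# Sketch: OCR the temporary image directly instead of going through textract.
# Requires the tesseract binary on PATH; path matches the pipeline's tmp file.
text = pytesseract.image_to_string(Image.open("./temporary_picture.jpg"))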
similarity_pipeline/similarity_pipeline_7_asset_similarity.py (21 additions, 9 deletions)
@@ -6,8 +6,6 @@
 from numpy import nan
 from numpy import linalg as linalg
 import numpy as np
-from sklearn.cluster import DBSCAN
-from sklearn.decomposition import PCA
 
 from backend.api.python_endpoints import asset_endpoints
 from backend.api.python_endpoints import timeseries_endpoints
@@ -16,7 +14,9 @@
     TimeseriesNodeFlat,
     TimeseriesValueTypes,
 )
-from similarity_pipeline.similarity_pipeline_status_manager import SimilarityPipelineStatusManager
+from similarity_pipeline.similarity_pipeline_status_manager import (
+    SimilarityPipelineStatusManager,
+)
 from util.log import logger
 
 # #############################################################################
@@ -33,7 +33,9 @@ def similarity_pipeline_7_asset_similarity():
     logger.info("Loading assets and related nodes...")
 
     # get assets flat (just for iris)
-    asset_nodes_flat: List[TimeseriesNodeFlat] = asset_endpoints.get_asset_nodes(deep=False)
+    asset_nodes_flat: List[TimeseriesNodeFlat] = asset_endpoints.get_asset_nodes(
+        deep=False
+    )
 
     # per asset: get connected ts-clusters, keywords... (seperated, so they can be easily weighted differently)
     timeseries_clusters = []
@@ -44,9 +46,17 @@ def similarity_pipeline_7_asset_similarity():
 
     keyword_sets = []
     for asset_node in asset_nodes_flat:
-        keyword_sets.append(
-            file_endpoints.get_keywords_set_for_asset(asset_iri=asset_node.iri)
+        keyword_set = asset_endpoints.get_keywords_set_for_asset(
+            asset_iri=asset_node.iri
         )
+        # Handle dimension clusters like key-phrases, as they can only occur once per asset
+        cluster = file_endpoints.get_dimensions_cluster_for_asset(
+            asset_iri=asset_node.iri
+        )
+        if cluster is not None:
+            keyword_set.add(cluster.iri)
+
+        keyword_sets.append(keyword_set)
 
     ################################################
     logger.info("Calculating similarity scores...")
@@ -70,7 +80,8 @@ def similarity_pipeline_7_asset_similarity():
 
         cosine_similarity = round(
             number=float(
-                np.dot(vector1, vector2) / (linalg.norm(vector1) * linalg.norm(vector2))
+                np.dot(vector1, vector2)
+                / (linalg.norm(vector1) * linalg.norm(vector2))
             ),
             ndigits=4,
        )
@@ -98,8 +109,9 @@ def similarity_pipeline_7_asset_similarity():
                 similarity_score=combined_similarity,
             )
 
-
     ################################################
     # logger.info("Building clusters...") TODO
 
-    SimilarityPipelineStatusManager.instance().set_active(active=False, stage="asset_similarity")
+    SimilarityPipelineStatusManager.instance().set_active(
+        active=False, stage="asset_similarity"
+    )
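
Editor's note: the cosine hunk above is only a reformatting; how the keyword sets are vectorized happens elsewhere and is not part of this commit. As a self-contained illustration, the sketch below one-hot encodes two keyword sets over their union (an assumed encoding) and applies the same rounded cosine formula:

import numpy as np
from numpy import linalg


def keyword_cosine(set_a: set, set_b: set) -> float:
    # One-hot encode both sets over the union vocabulary, then apply the
    # rounded cosine similarity from the hunk above. The encoding is an
    # assumption for illustration, not taken from the commit.
    if not set_a or not set_b:
        return 0.0  # avoid division by zero on empty keyword sets
    vocab = sorted(set_a | set_b)
    vector1 = np.array([1.0 if kw in set_a else 0.0 for kw in vocab])
    vector2 = np.array([1.0 if kw in set_b else 0.0 for kw in vocab])
    return round(
        number=float(
            np.dot(vector1, vector2)
            / (linalg.norm(vector1) * linalg.norm(vector2))
        ),
        ndigits=4,
    )


print(keyword_cosine({"pump", "motor"}, {"pump", "valve"}))  # 0.5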
