Skip to content

Commit

Permalink
SFR-2292: Skip clustering records with no title
Browse files: browse the repository at this point in the history
  • Loading branch information
kylevillegas93 committed Oct 29, 2024
1 parent d1b9e2e commit 074b909
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
2 changes: 1 addition & 1 deletion main.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -76,7 +76,7 @@ def createArgParser():
parser.add_argument('-o', '--offset',
help='Set start offset for current processed (for batched import process)')
parser.add_argument('-r', '--singleRecord',
help='Single record ID for ingesting an individual record (only applicable for DOAB)')
help='Single record ID for ingesting an individual record')
parser.add_argument('options', nargs='*', help='Additional arguments')

return parser
Expand Down
12 changes: 11 additions & 1 deletion processes/cluster.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -40,6 +40,8 @@ def runProcess(self):
self.cluster_records(full=True)
elif self.process == 'custom':
self.cluster_records(start_datetime=self.ingestPeriod)
elif self.process == 'single':
self.cluster_records(record_uuid=self.singleRecord)
else:
logger.warning(f'Unknown cluster process type {self.process}')
except Exception as e:
Expand All @@ -48,7 +50,7 @@ def runProcess(self):
finally:
self.close_connection()

def cluster_records(self, full=False, start_datetime=None):
def cluster_records(self, full=False, start_datetime=None, record_uuid=None):
get_unclustered_records_query = (
self.session.query(Record)
.filter(Record.frbr_status == 'complete')
Expand All @@ -65,6 +67,9 @@ def cluster_records(self, full=False, start_datetime=None):

get_unclustered_records_query = get_unclustered_records_query.filter(Record.date_modified > start_datetime)

if record_uuid:
get_unclustered_records_query = get_unclustered_records_query.filter(Record.uuid == record_uuid)

works_to_index = []
work_ids_to_delete = set()

Expand Down Expand Up @@ -167,6 +172,11 @@ def find_all_matching_records(self, record: Record):

for matched_record in matched_records:
matched_record_title, matched_record_id, matched_record_identifiers = matched_record

if not matched_record_title:
logger.warning(f'Matched record with id {matched_record_id} has no title')
continue

tokenized_matched_record_title = self.tokenize_title(matched_record_title)

if match_distance > 0 and not self.titles_overlap(tokenized_record_title, tokenized_matched_record_title):
Expand Down

0 comments on commit 074b909

Please sign in to comment.