From 6c2101c2b79418dca5e275aef2f8899dc0c96def Mon Sep 17 00:00:00 2001
From: siqiwu
Date: Wed, 13 Mar 2024 11:36:37 -0400
Subject: [PATCH] adding comments to process_data and code refactor

---
 sourcecode/scoring/process_data.py | 168 +++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 34 deletions(-)

diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py
index dca467fd..a24ba284 100644
--- a/sourcecode/scoring/process_data.py
+++ b/sourcecode/scoring/process_data.py
@@ -173,6 +173,22 @@ def read_from_tsv(
   return notes, ratings, noteStatusHistory, userEnrollment
 
 
+def get_unique_size(df: pd.DataFrame, k: str, rows: pd.Index = None) -> int:
+  """Return the number of unique values in a column `k` of a DataFrame `df` at `rows`.
+
+  Args:
+    df (pd.DataFrame): DataFrame
+    k (str): column name
+    rows (pd.Index, optional): rows to consider. Defaults to None.
+
+  Returns:
+    int: number of unique values in the column
+  """
+  if rows is not None:
+    df = df.loc[rows]
+  return len(np.unique(df[k]))
+
+
 def _filter_misleading_notes(
   notes: pd.DataFrame,
   ratings: pd.DataFrame,
@@ -182,9 +198,14 @@
   """
   This function actually filters ratings (not notes), based on which notes they rate.
 
-  Filter out ratings of notes that say the Tweet isn't misleading.
-  Also filter out ratings of deleted notes, unless they were deleted after
-  c.deletedNotesTombstoneLaunchTime, and appear in noteStatusHistory.
+  For deleted notes (c.classificationKey is NaN):
+  - Keep ratings of notes that appear in noteStatusHistory (previously scored)
+  - Remove ratings of notes that do not appear in noteStatusHistory
+  For still available notes (c.classificationKey is either MISINFORMED_OR_POTENTIALLY_MISLEADING or NOT_MISLEADING):
+  - Keep ratings of notes saying the associated tweet is misleading
+  - For those saying the associated tweet is not misleading:
+    - Keep ratings after the new UI launch time, c.notMisleadingUILaunchTime
+    - Remove ratings before the new UI launch time, c.notMisleadingUILaunchTime
 
   Args:
     notes (pd.DataFrame): _description_
@@ -202,54 +223,99 @@ def _filter_misleading_notes(
     suffixes=("", "_nsh"),
   )
 
-  deletedNoteKey = "deletedNote"
-  notDeletedMisleadingKey = "notDeletedMisleading"
-  deletedButInNSHKey = "deletedButInNSH"
   createdAtMillisNSHKey = c.createdAtMillisKey + "_nsh"
 
-  ratings[deletedNoteKey] = pd.isna(ratings[c.classificationKey])
-  ratings[notDeletedMisleadingKey] = np.invert(ratings[deletedNoteKey]) & (
-    ratings[c.classificationKey] == c.notesSaysTweetIsMisleadingKey
-  )
-  ratings[deletedButInNSHKey] = ratings[deletedNoteKey] & np.invert(
-    pd.isna(ratings[createdAtMillisNSHKey])
-  )
+  # rows in ratings that are on deleted notes, check if the note is in noteStatusHistory
+  deletedNote = pd.isna(ratings[c.classificationKey])
 
-  deletedNotInNSH = (ratings[deletedNoteKey]) & pd.isna(ratings[createdAtMillisNSHKey])
-  notDeletedNotMisleadingOldUI = (
-    ratings[c.classificationKey] == c.noteSaysTweetIsNotMisleadingKey
-  ) & (ratings[createdAtMillisNSHKey] <= c.notMisleadingUILaunchTime)
-  notDeletedNotMisleadingNewUI = (
-    ratings[c.classificationKey] == c.noteSaysTweetIsNotMisleadingKey
-  ) & (ratings[createdAtMillisNSHKey] > c.notMisleadingUILaunchTime)
+  # deleted but in noteStatusHistory, keep
+  deletedButInNSHNote = deletedNote & pd.notna(ratings[createdAtMillisNSHKey])
+  # deleted and not in noteStatusHistory, remove
+  deletedNotInNSHNote = deletedNote & pd.isna(ratings[createdAtMillisNSHKey])
+
+  # rows in ratings that are on still available notes, check if the note says the tweet is misleading or not
+  availableNote = pd.notna(ratings[c.classificationKey])
+
+  # not deleted and says the tweet is misleading, keep
+  notDeletedMisleadingNote = ratings[c.classificationKey] == c.notesSaysTweetIsMisleadingKey
+
+  # not deleted and says the tweet is not misleading, check if it's after or before the new UI launch time
+  notDeletedNotMisleadingNote = ratings[c.classificationKey] == c.noteSaysTweetIsNotMisleadingKey
+
+  # not deleted, says the tweet is not misleading, and after new UI launch time, keep
+  notDeletedNotMisleadingNewUINote = (ratings[c.classificationKey] == c.noteSaysTweetIsNotMisleadingKey) & (ratings[createdAtMillisNSHKey] > c.notMisleadingUILaunchTime)
+  # not deleted, says the tweet is not misleading, and before new UI launch time, remove
+  notDeletedNotMisleadingOldUINote = (ratings[c.classificationKey] == c.noteSaysTweetIsNotMisleadingKey) & (ratings[createdAtMillisNSHKey] <= c.notMisleadingUILaunchTime)
 
   if logging:
     print(
-      f"Preprocess Data: Filter misleading notes, starting with {len(ratings)} ratings on {len(np.unique(ratings[c.noteIdKey]))} notes"
+      f"Finished filtering misleading notes\n"
+      f"Preprocess Data: Filter misleading notes, starting with {len(ratings)} ratings on {get_unique_size(ratings, c.noteIdKey)} notes"
+    )
+    print(
+      f"For {deletedNote.sum()} ratings on {get_unique_size(ratings, c.noteIdKey, rows=deletedNote)} deleted notes"
+    )
+    print(
+      f"  Keep {deletedButInNSHNote.sum()} ratings on {get_unique_size(ratings, c.noteIdKey, rows=deletedButInNSHNote)} deleted notes that are in noteStatusHistory (e.g., previously scored)"
+    )
+    print(
+      f"  Remove {deletedNotInNSHNote.sum()} ratings on {get_unique_size(ratings, c.noteIdKey, rows=deletedNotInNSHNote)} deleted notes that are not in noteStatusHistory (e.g., old)"
+    )
+    print(
+      f"For {availableNote.sum()} ratings on {get_unique_size(ratings, c.noteIdKey, rows=availableNote)} still available notes"
     )
     print(
-      f"  Keeping {ratings[notDeletedMisleadingKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[notDeletedMisleadingKey],c.noteIdKey]))} misleading notes"
+      f"  Keep {notDeletedMisleadingNote.sum()} ratings on {get_unique_size(ratings, c.noteIdKey, rows=notDeletedMisleadingNote)} available notes saying the associated tweet is misleading"
     )
     print(
-      f"  Keeping {ratings[deletedButInNSHKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[deletedButInNSHKey],c.noteIdKey]))} deleted notes that were previously scored (in note status history)"
+      f"  For {notDeletedNotMisleadingNote.sum()} ratings on {get_unique_size(ratings, c.noteIdKey, rows=notDeletedNotMisleadingNote)} available notes saying the associated tweet is not misleading"
     )
     print(
-      f"  Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but are not-misleading."
+      f"   Keep {notDeletedNotMisleadingNewUINote.sum()} ratings on {get_unique_size(ratings, c.noteIdKey, rows=notDeletedNotMisleadingNewUINote)} available and not misleading notes, and after the new UI launch time"
    )
     print(
-      f"  Removing {deletedNotInNSH.sum()} ratings on {len(np.unique(ratings.loc[deletedNotInNSH, c.noteIdKey]))} notes that were deleted and not in note status history (e.g. old)."
+      f"   Remove {notDeletedNotMisleadingOldUINote.sum()} ratings on {get_unique_size(ratings, c.noteIdKey, rows=notDeletedNotMisleadingOldUINote)} available and not misleading notes, and before the new UI launch time"
     )
+
+  # Validate expectation that all notes with ratings are either deleted or not deleted
+  assert len(ratings) == (
+    deletedNote.sum() + availableNote.sum()
+  ), "rows of ratings must equal the sum of ratings on deleted notes and ratings on available notes"
+  assert get_unique_size(ratings, c.noteIdKey) == (
+    get_unique_size(ratings, c.noteIdKey, rows=deletedNote) + get_unique_size(ratings, c.noteIdKey, rows=availableNote)
+  ), "rows of notes must equal the sum of deleted notes and available notes"
+
+  # Validate expectation that all deleted notes must be either in noteStatusHistory or not in noteStatusHistory
+  assert deletedNote.sum() == (
+    deletedButInNSHNote.sum() + deletedNotInNSHNote.sum()
+  ), "all ratings on deleted notes must be either in noteStatusHistory or not in noteStatusHistory"
+  assert get_unique_size(ratings, c.noteIdKey, rows=deletedNote) == (
+    get_unique_size(ratings, c.noteIdKey, rows=deletedButInNSHNote) + get_unique_size(ratings, c.noteIdKey, rows=deletedNotInNSHNote)
+  ), "all deleted notes must be either in noteStatusHistory or not in noteStatusHistory"
+
+  # Validate expectation that all available notes must either say Tweet Is Misleading or Tweet Is Not Misleading
+  assert availableNote.sum() == (
+    notDeletedMisleadingNote.sum() + notDeletedNotMisleadingNote.sum()
+  ), "all ratings on available notes must either say Tweet Is Misleading or Tweet Is Not Misleading"
+  assert get_unique_size(ratings, c.noteIdKey, rows=availableNote) == (
+    get_unique_size(ratings, c.noteIdKey, rows=notDeletedMisleadingNote) + get_unique_size(ratings, c.noteIdKey, rows=notDeletedNotMisleadingNote)
+  ), "all available notes must either say Tweet Is Misleading or Tweet Is Not Misleading"
+
+  # Validate expectation that all available and not misleading notes must be either after or before the new UI launch time
+  assert notDeletedNotMisleadingNote.sum() == (
+    notDeletedNotMisleadingNewUINote.sum() + notDeletedNotMisleadingOldUINote.sum()
+  ), "all ratings on available and not misleading notes must be either after or before the new UI launch time"
+  assert get_unique_size(ratings, c.noteIdKey, rows=notDeletedNotMisleadingNote) == (
+    get_unique_size(ratings, c.noteIdKey, rows=notDeletedNotMisleadingNewUINote) + get_unique_size(ratings, c.noteIdKey, rows=notDeletedNotMisleadingOldUINote)
+  ), "all available and not misleading notes must be either after or before the new UI launch time"
 
   ratings = ratings[
-    ratings[notDeletedMisleadingKey] | ratings[deletedButInNSHKey] | notDeletedNotMisleadingNewUI
+    deletedButInNSHNote | notDeletedMisleadingNote | notDeletedNotMisleadingNewUINote
   ]
   ratings = ratings.drop(
     columns=[
       createdAtMillisNSHKey,
       c.classificationKey,
-      deletedNoteKey,
-      notDeletedMisleadingKey,
-      deletedButInNSHKey,
     ]
   )
   return ratings
@@ -303,11 +369,12 @@ def preprocess_data(
   shouldFilterNotMisleadingNotes: bool = True,
   logging: bool = True,
 ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-  """Populate helpfulNumKey, a unified column that merges the helpfulness answers from 
+  """Populate helpfulNumKey, a unified column that merges the helpfulness answers from
   the V1 and V2 rating forms together, as described in
   https://twitter.github.io/communitynotes/ranking-notes/#helpful-rating-mapping.
 
-  Also, filter notes that indicate the Tweet is misleading, if the flag is True.
+  Also, remove notes that indicate the associated tweet is not misleading,
+  if the `shouldFilterNotMisleadingNotes` flag is True.
 
   Args:
       notes (pd.DataFrame)
@@ -330,19 +397,52 @@
       "Timestamp of latest note in data: ",
       pd.to_datetime(notes[c.createdAtMillisKey], unit="ms").max(),
     )
+    print(
+      f"Original row numbers from provided tsv files\n",
+      f"  notes: {len(notes)}\n",
+      f"  ratings: {len(ratings)}\n",
+      f"  noteStatusHistory: {len(noteStatusHistory)}\n",
+    )
+
+  # each rating must have a unique (noteId, raterParticipantId) pair
   ratings = remove_duplicate_ratings(ratings)
+  # each note must have a unique noteId
   notes = remove_duplicate_notes(notes)
+  if logging:
+    print(
+      f"After removing duplicates, there are {len(notes)} notes and {len(ratings)} ratings from {get_unique_size(ratings, c.noteIdKey)} notes\n"
+      f"  Thus, {len(notes) - get_unique_size(ratings, c.noteIdKey)} notes have no ratings yet, removed..."
+    )
 
+  # add a new column `helpfulNum` to `ratings`
+  # `helpfulNum` is a unified column that merges the helpfulness answers from the V1 and V2 rating forms together
+  # `helpfulNum` is a float, with 0.0 for not helpful, 0.5 for somewhat helpful, and 1.0 for helpful
   ratings.loc[:, c.helpfulNumKey] = np.nan
   ratings.loc[ratings[c.helpfulKey] == 1, c.helpfulNumKey] = 1
   ratings.loc[ratings[c.notHelpfulKey] == 1, c.helpfulNumKey] = 0
   ratings.loc[ratings[c.helpfulnessLevelKey] == c.notHelpfulValueTsv, c.helpfulNumKey] = 0
   ratings.loc[ratings[c.helpfulnessLevelKey] == c.somewhatHelpfulValueTsv, c.helpfulNumKey] = 0.5
   ratings.loc[ratings[c.helpfulnessLevelKey] == c.helpfulValueTsv, c.helpfulNumKey] = 1
+  num_raw_ratings = len(ratings)
   ratings = ratings.loc[~pd.isna(ratings[c.helpfulNumKey])]
+  if logging:
+    print(
+      f"After populating helpfulNumKey, there are {len(ratings)} ratings from {get_unique_size(ratings, c.noteIdKey)} notes\n"
+      f"  Thus, {num_raw_ratings - len(ratings)} ratings have no helpfulness labels (i.e., helpfulKey=0 and notHelpfulKey=0), removed..."
+    )
+
   notes[c.tweetIdKey] = notes[c.tweetIdKey].astype(str)
 
+  # merge `notes` with `noteStatusHistory`
+  # `noteStatusHistory` contains the status of all previously scored notes, including deleted ones
+  # `notes` contains currently available notes, including the new ones (from last release timestamp) but excluding deleted ones
+  # after the merge, `noteStatusHistory` will have a new column called `classification`, populated from `notes` dataframe
+  # `classification` is the status of the note, which can be one of the following:
+  # - MISINFORMED_OR_POTENTIALLY_MISLEADING
+  # - NOT_MISLEADING
+  # - NaN (if the note is deleted)
   noteStatusHistory = note_status_history.merge_note_info(noteStatusHistory, notes)
 
   if shouldFilterNotMisleadingNotes:
@@ -350,11 +450,11 @@
 
   if logging:
     print(
-      "Num Ratings: %d, Num Unique Notes Rated: %d, Num Unique Raters: %d"
+      "After data preprocessing, Num Ratings: %d, Num Unique Notes Rated: %d, Num Unique Raters: %d\n"
       % (
         len(ratings),
-        len(np.unique(ratings[c.noteIdKey])),
-        len(np.unique(ratings[c.raterParticipantIdKey])),
+        get_unique_size(ratings, c.noteIdKey),
+        get_unique_size(ratings, c.raterParticipantIdKey),
       )
     )
   return notes, ratings, noteStatusHistory
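
Illustrative sketch, not part of the patch: a minimal, standalone example of the two pieces the new comments describe, the get_unique_size helper and the helpfulNum mapping that unifies the V1 (helpful/notHelpful) and V2 (helpfulnessLevel) rating forms. The column names and level strings below are toy stand-ins for the c.* constants and TSV values used in the repository.

import numpy as np
import pandas as pd


def get_unique_size(df: pd.DataFrame, k: str, rows: pd.Index = None) -> int:
  # Count unique values in column `k`, optionally restricted to a boolean row mask.
  if rows is not None:
    df = df.loc[rows]
  return len(np.unique(df[k]))


# Toy ratings frame mixing V1 (helpful/notHelpful) and V2 (helpfulnessLevel) answers.
ratings = pd.DataFrame(
  {
    "noteId": [1, 1, 2, 3],
    "helpful": [1, 0, 0, 0],
    "notHelpful": [0, 1, 0, 0],
    "helpfulnessLevel": [None, None, "SOMEWHAT_HELPFUL", "HELPFUL"],
  }
)

# Unified helpfulness score: 0.0 = not helpful, 0.5 = somewhat helpful, 1.0 = helpful.
ratings["helpfulNum"] = np.nan
ratings.loc[ratings["helpful"] == 1, "helpfulNum"] = 1
ratings.loc[ratings["notHelpful"] == 1, "helpfulNum"] = 0
ratings.loc[ratings["helpfulnessLevel"] == "NOT_HELPFUL", "helpfulNum"] = 0
ratings.loc[ratings["helpfulnessLevel"] == "SOMEWHAT_HELPFUL", "helpfulNum"] = 0.5
ratings.loc[ratings["helpfulnessLevel"] == "HELPFUL", "helpfulNum"] = 1

print(ratings["helpfulNum"].tolist())  # [1.0, 0.0, 0.5, 1.0]
print(get_unique_size(ratings, "noteId"))  # 3 unique notes in total
print(get_unique_size(ratings, "noteId", rows=ratings["helpfulNum"] == 1))  # 2 notes with a helpful rating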