CornellNLP · seanzhangkx8 · Dec 28, 2024 · Dec 17, 2024 · Dec 23, 2024 · Dec 24, 2024
diff --git a/convokit/__init__.py b/convokit/__init__.py
@@ -1,22 +1,31 @@
-from .model import *
-from .util import *
-from .coordination import *
-from .politenessStrategies import *
-from .transformer import *
-from .convokitPipeline import *
-from .hyperconvo import *
-from .speakerConvoDiversity import *
-from .text_processing import *
-from .phrasing_motifs import *
-from .prompt_types import *
-from .classifier import *
-from .ranker import *
-from .forecaster import *
-from .fighting_words import *
-from .paired_prediction import *
-from .bag_of_words import *
-from .expected_context_framework import *
-from .surprise import *
-from .convokitConfig import *
+import warnings
+
+try:
+    from .model import *
+    from .util import *
+    from .coordination import *
+    from .politenessStrategies import *
+    from .transformer import *
+    from .convokitPipeline import *
+    from .hyperconvo import *
+    from .speakerConvoDiversity import *
+    from .text_processing import *
+    from .phrasing_motifs import *
+    from .prompt_types import *
+    from .classifier import *
+    from .ranker import *
+    from .forecaster import *
+    from .fighting_words import *
+    from .paired_prediction import *
+    from .bag_of_words import *
+    from .expected_context_framework import *
+    from .surprise import *
+    from .convokitConfig import *
+except Exception as e:
+    print(f"An error occurred: {e}")
+    warnings.warn(
+        "If you are using ConvoKit with Google Colab, incorrect versions of some packages (ex. scipy) may be imported while runtime start. To fix the issue, restart the session and run all codes again. Thank you!"
+    )
+
 
 # __path__ = __import__('pkgutil').extend_path(__path__, __name__)
diff --git a/convokit/model/conversation.py b/convokit/model/conversation.py
@@ -19,8 +19,7 @@ class Conversation(CorpusComponent):
     :param meta: Table of initial values for conversation-level metadata
 
     :ivar id: the ID of the Conversation
-    :ivar meta: A dictionary-like view object providing read-write access to
-        conversation-level metadata.
+    :ivar meta: A dictionary-like view object providing read-write access to conversation-level metadata.
     """
 
     def __init__(
@@ -67,9 +66,8 @@ def iter_utterances(
         """
         Get utterances in the Corpus, with an optional selector that filters for Utterances that should be included.
 
-        :param selector: a (lambda) function that takes an Utterance and returns True or False (i.e. include / exclude).
-                        By default, the selector includes all Utterances in the Conversation.
-                :return: a generator of Utterances
+        :param selector: a (lambda) function that takes an Utterance and returns True or False (i.e. include / exclude). By default, the selector includes all Utterances in the Conversation.
+        :return: a generator of Utterances
         """
         for ut_id in self._utterance_ids:
             utt = self._owner.get_utterance(ut_id)

diff --git a/convokit/model/corpusComponent.py b/convokit/model/corpusComponent.py
@@ -129,6 +129,7 @@ def set_data(self, property_name, value):
     def retrieve_meta(self, key: str):
         """
         Retrieves a value stored under the key of the metadata of corpus object
+
         :param key: name of metadata attribute
         :return: value
         """
@@ -137,6 +138,7 @@ def retrieve_meta(self, key: str):
     def add_meta(self, key: str, value) -> None:
         """
         Adds a key-value pair to the metadata of the corpus object
+
         :param key: name of metadata attribute
         :param value: value of metadata attribute
         :return: None
@@ -148,11 +150,10 @@ def get_vector(
     ):
         """
         Get the vector stored as `vector_name` for this object.
+
         :param vector_name: name of vector
-        :param as_dataframe: whether to return the vector as a dataframe (True) or in its raw array form (False). False
-            by default.
-        :param columns: optional list of named columns of the vector to include. All columns returned otherwise. This
-            parameter is only used if as_dataframe is set to True
+        :param as_dataframe: whether to return the vector as a dataframe (True) or in its raw array form (False). False by default.
+        :param columns: optional list of named columns of the vector to include. All columns returned otherwise. This parameter is only used if as_dataframe is set to True
         :return: a numpy / scipy array
         """
         if vector_name not in self.vectors:
@@ -166,10 +167,10 @@ def get_vector(
 
     def add_vector(self, vector_name: str):
         """
-        Logs in the Corpus component object's internal vectors list that the component object has a vector row
-        associated with it in the vector matrix named `vector_name`.
-        Transformers that add vectors to the Corpus should use this to update the relevant component objects during
-        the transform() step.
+        Logs in the Corpus component object's internal vectors list that the component object has a vector row associated with it in the vector matrix named `vector_name`.
+
+        Transformers that add vectors to the Corpus should use this to update the relevant component objects during the transform() step.
+
         :param vector_name: name of vector matrix
         :return: None
         """
@@ -182,6 +183,7 @@ def has_vector(self, vector_name: str):
     def delete_vector(self, vector_name: str):
         """
         Delete a vector associated with this Corpus component object.
+
         :param vector_name:
         :return: None
         """

diff --git a/convokit/prompt_types/promptTypes.py b/convokit/prompt_types/promptTypes.py
@@ -127,11 +127,8 @@ def fit(
         Fits a PromptTypes model for a corpus -- that is, learns latent representations of prompt and response terms, as well as prompt types.
 
         :param corpus: Corpus
-        :param prompt_selector: a boolean function of signature `filter(utterance)` that determines which
-        utterances will be considered as prompts in the fit step. defaults to using all utterances which have a response.
-        :param reference_selector: a boolean function of signature `filter(utterance)` that determines which utterances
-            will be considered as responses in the fit step. defaults to using all utterances which are responses to a
-            prompt.
+        :param prompt_selector: a boolean function of signature `filter(utterance)` that determines which utterances will be considered as prompts in the fit step. Defaults to using all utterances which have a response.
+        :param reference_selector: a boolean function of signature `filter(utterance)` that determines which utterances will be considered as responses in the fit step. Defaults to using all utterances which are responses to a prompt.
 
         :return: None
         """

diff --git a/docs/source/data_format.rst b/docs/source/data_format.rst
@@ -56,6 +56,7 @@ utterances.jsonl contains a list of such utterances. An example utterance is sho
 ::
 
  {'id': '200', 'speaker': 'mr. srinivasan', 'conversation_id': '145', 'reply_to': '199', 'timestamp': None, 'text': 'It -- it does.', 'meta': {'case': '02-1472', 'side': 'respondent'}}
+
 ::
 
 
@@ -121,6 +122,7 @@ As an example, the corpus-level metadata for the Reddit corpus (small) is shown
 ::
 
 "overall-index": {"subreddit": "<class 'str'>", "num_posts": "<class 'int'>", "num_comments": "<class 'int'>", "num_speakers": "<class 'int'>"}
+
 :: 
 
 

diff --git a/docs/source/supreme.rst b/docs/source/supreme.rst
@@ -1,5 +1,5 @@
 Supreme Court Oral Arguments Corpus
-==============================
+=======================================
 
 
 A collection of cases from the U.S. Supreme Court, along with transcripts of oral arguments. Contains approximately 1,700,000 utterances over 8,000 oral arguments transcripts from 7,700 cases.
@@ -14,7 +14,7 @@ The following examples use this corpus:
 * `computing linguistic coordination <https://github.com/CornellNLP/ConvoKit/blob/master/examples/coordination/examples.ipynb>`_
 
 Some considerations regarding case and voting information
--------------------------------------------------
+---------------------------------------------------------------
 
 Each case in the data can have multiple conversations, corresponding to multiple sessions of oral arguments heard. For convenience, we include information for each conversation about how justices voted in the  corresponding *case*, meaning that vote information will be repeated across each conversation corresponding to a case. The case metadata file also lists vote information.
 
@@ -126,7 +126,7 @@ Case information
 * transcripts: a list of transcript names, URLs and IDs (corresponding to the IDs of conversations in the corpus). 
 
 Citation and other versions
-^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 This corpus extends a `smaller dataset <https://confluence.cornell.edu/display/llresearch/Supreme+Court+Dialogs+Corpus>`_ of oral arguments that we previously released together with `Echoes of power\: Language effects and power differences in social interaction <https://www.cs.cornell.edu/~cristian/Echoes_of_power.html>`_. Cristian Danescu-Niculescu-Mizil, Bo Pang, Lillian Lee and Jon Kleinberg. WWW 2012.  Please cite the Echoes of Powers paper if you use either version of the corpus.  If you use the ConvoKit version 	please additionally cite: `ConvoKit\: A Toolkit for the Analysis of Conversations <https://www.cs.cornell.edu/~cristian/ConvoKit_Demo_Paper_files/convokit-demo-paper.pdf>`_. Jonathan P. Chang, Caleb Chiam, Liye Fu, Andrew Wang, Justine Zhang, Cristian Danescu-Niculescu-Mizil. Proceedings of SIGDIAL. 2020.
 

diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
@@ -10,6 +10,14 @@ General checks
 Issues
 ^^^^^^
 
+**Google Colab User Note**
+
+Running ConvoKit in Google Colab can trigger an error with the scipy package, most likely because the Colab runtime preloads an older version that
+is not compatible with other packages. When installing ConvoKit in a Colab environment, a user warning message should be displayed if this error occurs.
+The error can be easily fixed by restarting the Colab runtime session and running the cells again.
+
+-----------------------------
+
 **Error Associated with Numpy 2.0.0**
 
 The release of `numpy 2.0.0 <https://numpy.org/devdocs/release/2.0.0-notes.html>`_ is exciting,

diff --git a/examples/Introduction_to_ConvoKit.ipynb b/examples/Introduction_to_ConvoKit.ipynb
diff --git a/examples/conversations-gone-awry/Conversations_Gone_Awry_Prediction.ipynb b/examples/conversations-gone-awry/Conversations_Gone_Awry_Prediction.ipynb
diff --git a/examples/hyperconvo/hyperconvo_demo.ipynb b/examples/hyperconvo/hyperconvo_demo.ipynb
diff --git a/examples/hyperconvo/predictive_tasks.ipynb b/examples/hyperconvo/predictive_tasks.ipynb
@@ -86,7 +86,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_utterance_ids, \n",
+    "threads_corpus = Corpus.reindex_conversations(source_corpus=corpus, \n",
+    "                                              new_convo_roots=top_level_utterance_ids, \n",
     "                                              preserve_convo_meta=True,\n",
     "                                              preserve_corpus_meta=False)"
    ]
@@ -226,7 +227,7 @@
    "source": [
     "## volume is the number of unique users in the first 10 comments\n",
     "for convo in threads_corpus.iter_conversations():\n",
-    "    convo.meta['volume'] = len(set([utt.user for utt in convo.get_chronological_utterance_list()[:10]]))"
+    "    convo.meta['volume'] = len(set([utt.speaker for utt in convo.get_chronological_utterance_list()[:10]]))"
    ]
   },
   {

diff --git a/examples/merging/corpus_merge_demo.ipynb b/examples/merging/corpus_merge_demo.ipynb
@@ -124,7 +124,7 @@
     }
    ],
    "source": [
-    "corpus3 = corpus1.merge(corpus2)"
+    "corpus3 = Corpus.merge(corpus1, corpus2)"
    ]
   },
   {
@@ -325,7 +325,7 @@
     }
    ],
    "source": [
-    "corpus6 = corpus4.merge(corpus5)"
+    "corpus6 = Corpus.merge(corpus4, corpus5)"
    ]
   },
   {

diff --git a/examples/politeness-strategies/Politeness_Strategies_in_MT-mediated_Communication.ipynb b/examples/politeness-strategies/Politeness_Strategies_in_MT-mediated_Communication.ipynb
diff --git a/examples/text-processing/text_preprocessing_demo.ipynb b/examples/text-processing/text_preprocessing_demo.ipynb
@@ -786,7 +786,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "adhoc_utt = prep.transform_utterance(adhoc_utt)"
+    "adhoc_utt = prep.transform_utterance(corpus.random_utterance())"
    ]
   },
   {

diff --git a/examples/vectors/bag-of-words-demo.ipynb b/examples/vectors/bag-of-words-demo.ipynb
@@ -794,7 +794,7 @@
     }
    ],
    "source": [
-    "threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids)"
+    "threads_corpus = corpus.reindex_conversations(corpus, new_convo_roots=top_level_comment_ids)"
    ]
   },
   {

diff --git a/setup.py b/setup.py
@@ -42,6 +42,7 @@
     },
     install_requires=[
         "matplotlib>=3.0.0",
+        "scipy>1.14",
         "pandas>=1.5.0",
         "numpy>=2.0.0",
         "msgpack-numpy>=0.4.3.2",
-Original file line number
+Diff line change
@@ Expand Up @@
     ::
      {'id': '200', 'speaker': 'mr. srinivasan', 'conversation_id': '145', 'reply_to': '199', 'timestamp': None, 'text': 'It -- it does.', 'meta': {'case': '02-1472', 'side': 'respondent'}}
     ::
@@ Expand Down Expand Up @@
     ::
     "overall-index": {"subreddit": "<class 'str'>", "num_posts": "<class 'int'>", "num_comments": "<class 'int'>", "num_speakers": "<class 'int'>"}
     ::
@@ Expand Down @@