From 96df5f5be23395d16138ba60b96de48504d33e33 Mon Sep 17 00:00:00 2001
From: JamesX
Date: Mon, 17 Oct 2022 17:12:46 +0400
Subject: [PATCH 1/7] Initial commit for the profiling test -- not importing
 from forte.nltk or fortex.nltk initially, to prevent version conflicts, and
 using an all-in-one approach so that the test code runs the same way on
 0.2.0 and on newer versions (> 0.3.0).

---
 tests/forte/data/data_pack_profiling_test.py | 412 +++++++++++++++++++
 1 file changed, 412 insertions(+)
 create mode 100644 tests/forte/data/data_pack_profiling_test.py

diff --git a/tests/forte/data/data_pack_profiling_test.py b/tests/forte/data/data_pack_profiling_test.py
new file mode 100644
index 000000000..5e2a54415
--- /dev/null
+++ b/tests/forte/data/data_pack_profiling_test.py
@@ -0,0 +1,412 @@
+# Copyright 2019 The Forte Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utils for unit tests.
+"""
+
+import os
+import re
+import unittest
+import nltk
+
+__all__ = [
+    "performance_test",
+]
+
+from typing import Any, Callable
+
+from typing import Optional, Dict, Set, List, Any, Iterator
+
+from forte.common.configuration import Config
+from forte.common.resources import Resources
+
+from forte.data.readers import OntonotesReader, DirPackReader, StringReader
+from forte.processors.writers import PackNameJsonPackWriter
+
+from ft.onto.base_ontology import EntityMention, Token, Sentence, Phrase
+from nltk import (  # type: ignore
+    pos_tag,
+    ne_chunk,
+    PunktSentenceTokenizer,
+    download,
+)
+
+from nltk.tokenize import sent_tokenize, word_tokenize
+
+from forte.processors.base import PackProcessor
+from forte.data.data_pack import DataPack
+
+from forte import Pipeline
+from nltk.tokenize.treebank import TreebankWordTokenizer
+# from fortex.spacy import SpacyProcessor
+
+
+class SentenceAndTokenProcessor(PackProcessor):
+    def __init__(self):
+        super().__init__()
+
+    def initialize(self, resources, configs):
+        super().initialize(resources, configs)
+
+    def process_tokens(self, sentences, input_pack: DataPack):
+        """Basic tokenization and post tagging of the sentence.
+        Args:
+            processors: List of processor names.
+            sentences: Generator object which yields sentences in document.
+            input_pack: input pack which needs to be modified.
+        Returns: A mapping from SpaCy token index to Forte Token.
+ """ + tokens: [Token] = [] + + last_sentence_word_idx = 0 + for s_idx, sentence in sentences: + Sentence(input_pack, s_idx, s_idx + len(sentence)) + + for word in sentence: + begin_pos_word = word.idx + end_pos_word = begin_pos_word + len(word.text) + token = Token(input_pack, begin_pos_word, end_pos_word) + tokens.append(token) + + return tokens + + def _process(self, input_pack: DataPack): + doc = input_pack.text + + sentences = sent_tokenize(doc) + + # tokens = process_tokens(sentences, input_pack) # sentences, input_pack + tokens: [Token] = [] + + last_sentence_word_idx = 0 + s_idx = 0 + for sentence in sentences: + e_idx = s_idx + len(sentence) + Sentence(input_pack, s_idx, e_idx) + + last_sentence_word_idx = s_idx + for word in word_tokenize(sentence): + begin_pos_word = last_sentence_word_idx + end_pos_word = begin_pos_word + len(word) + token = Token(input_pack, begin_pos_word, end_pos_word) + last_sentence_word_idx = end_pos_word + 1 + tokens.append(token) + + s_idx = e_idx + 1 + + return tokens + + def record(self, record_meta: Dict[str, Set[str]]): + r"""Method to add output type record of current processor + to :attr:`forte.data.data_pack.Meta.record`. The processor produce + different types with different settings of `processors` in config. + Args: + record_meta: the field in the data pack for type record that need to + fill in for consistency checking. + """ + record_meta["ft.onto.base_ontology.Sentence"] = set() + record_meta["ft.onto.base_ontology.Token"] = set() + + +class ExampleNLTKPOSTagger(PackProcessor): + r"""A wrapper of NLTK pos tagger.""" + + def initialize(self, resources, configs): + super().initialize(resources, configs) + # download the NLTK average perceptron tagger + nltk.download("averaged_perceptron_tagger") + + def _process(self, input_pack: DataPack): + # get a list of token data entries from `input_pack` + # using `DataPack.get()`` method + + token_texts = [token.text for token in input_pack.get(Token)] + + # use nltk pos tagging module to tag token texts + taggings = nltk.pos_tag(token_texts) + + # assign nltk taggings to token attributes + for token, tag in zip(input_pack.get(Token), taggings): + token.pos = tag[1] + + # token.pos = word.tag_ + + # token.lemma = word.lemma_ + + # Store the spacy token index to forte token mapping. + # indexed_tokens[word.i] = token + + # return indexed_tokens + + def record(record_meta: Dict[str, Set[str]]): + record_meta["ft.onto.base_ontology.Token"].add("pos") + record_meta["ft.onto.base_ontology.Token"].add("lemma") + + def process_tokens( + processors, sentences, input_pack: DataPack + ) -> Dict[int, Token]: + """Basic tokenization and post tagging of the sentence. + Args: + processors: List of processor names. + sentences: Generator object which yields sentences in document. + input_pack: input pack which needs to be modified. + Returns: A mapping from SpaCy token index to Forte Token. + """ + indexed_tokens: Dict[int, Token] = {} + + for sentence in sentences: + Sentence(input_pack, sentence.start_char, sentence.end_char) + + if "tokenize" in processors: + # Iterating through spaCy token objects + for word in sentence: + begin_pos_word = word.idx + end_pos_word = begin_pos_word + len(word.text) + token = Token(input_pack, begin_pos_word, end_pos_word) + + if "pos" in processors: + token.pos = word.tag_ + + if "lemma" in processors: + token.lemma = word.lemma_ + + # Store the spacy token index to forte token mapping. 
+        indexed_tokens[word.i] = token
+        return indexed_tokens
+
+
+class NLTKNER(PackProcessor):
+    r"""A wrapper of NLTK NER."""
+
+    def initialize(self, resources: Resources, configs: Config):
+        super().initialize(resources, configs)
+        download("maxent_ne_chunker")
+        download("words")
+
+    def __init__(self):
+        super().__init__()
+        self.token_component = None
+
+    def _process(self, input_pack: DataPack):
+        for sentence in input_pack.get(Sentence):
+            token_entries = list(
+                input_pack.get(
+                    entry_type=Token,
+                    range_annotation=sentence,
+                    components=self.token_component,
+                )
+            )
+            tokens = [(token.text, token.pos) for token in token_entries]
+            ne_tree = ne_chunk(tokens)
+
+            index = 0
+            for chunk in ne_tree:
+                if hasattr(chunk, "label"):
+                    # For example:
+                    # chunk: Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])
+                    begin_pos = token_entries[index].span.begin
+                    end_pos = token_entries[index + len(chunk) - 1].span.end
+                    entity = EntityMention(input_pack, begin_pos, end_pos)
+                    entity.ner_type = chunk.label()
+                    index += len(chunk)
+                else:
+                    # For example:
+                    # chunk: ('This', 'DT')
+                    index += 1
+
+    def record(self, record_meta: Dict[str, Set[str]]):
+        r"""Method to add the output type record of `NLTKNER`, which is
+        `ft.onto.base_ontology.EntityMention` with attribute `ner_type`,
+        to :attr:`forte.data.data_pack.Meta.record`.
+
+        Args:
+            record_meta: the field in the datapack for the type record that
+                needs to be filled in for consistency checking.
+        """
+        record_meta["ft.onto.base_ontology.EntityMention"] = {"ner_type"}
+
+    def expected_types_and_attributes(self):
+        r"""Method to add the expected types `ft.onto.base_ontology.Token`
+        with attribute `pos` and `ft.onto.base_ontology.Sentence`, which
+        would be checked before running the processor if
+        the pipeline is initialized with
+        `enforce_consistency=True` or
+        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for
+        the pipeline.
+        """
+        return {
+            "ft.onto.base_ontology.Sentence": set(),
+            "ft.onto.base_ontology.Token": {"pos"},
+        }
+
+
+class NLTKWordTokenizer(PackProcessor):
+    r"""A wrapper of NLTK word tokenizer."""
+
+    def __init__(self):
+        super().__init__()
+        self.tokenizer = TreebankWordTokenizer()
+
+    def _process(self, input_pack: DataPack):
+        for begin, end in self.tokenizer.span_tokenize(input_pack.text):
+            Token(input_pack, begin, end)
+
+    def record(self, record_meta: Dict[str, Set[str]]):
+        r"""Method to add the output type record of `NLTKWordTokenizer`,
+        which is `ft.onto.base_ontology.Token`,
+        to :attr:`forte.data.data_pack.Meta.record`.
+
+        Args:
+            record_meta: the field in the datapack for the type record that
+                needs to be filled in for consistency checking.
+        """
+        record_meta["ft.onto.base_ontology.Token"] = set()
+
+
+class NLTKSentenceSegmenter(PackProcessor):
+    r"""A wrapper of NLTK sentence tokenizer."""
+
+    def initialize(self, resources: Resources, configs: Config):
+        super().initialize(resources, configs)
+        download("punkt")
+
+    def __init__(self):
+        super().__init__()
+        self.sent_splitter = PunktSentenceTokenizer()
+
+    def _process(self, input_pack: DataPack):
+        for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
+            Sentence(input_pack, begin, end)
+
+    def record(self, record_meta: Dict[str, Set[str]]):
+        r"""Method to add the output type record of `NLTKSentenceSegmenter`,
+        which is `ft.onto.base_ontology.Sentence`,
+        to :attr:`forte.data.data_pack.Meta.record`.
+
+        Args:
+            record_meta: the field in the datapack for the type record that
+                needs to be filled in for consistency checking.
+        """
+        record_meta["ft.onto.base_ontology.Sentence"] = set()
+
+
+class NLTKPOSTagger(PackProcessor):
+    r"""A wrapper of NLTK pos tagger."""
+
+    def initialize(self, resources: Resources, configs: Config):
+        super().initialize(resources, configs)
+        download("averaged_perceptron_tagger")
+
+    def __init__(self):
+        super().__init__()
+        self.token_component = None
+
+    def _process(self, input_pack: DataPack):
+        token_entries = list(
+            input_pack.get(entry_type=Token, components=self.token_component)
+        )
+        token_texts = [token.text for token in token_entries]
+        taggings = pos_tag(token_texts)
+        for token, tag in zip(token_entries, taggings):
+            token.pos = tag[1]
+
+    def record(self, record_meta: Dict[str, Set[str]]):
+        r"""Method to add the output type record of `NLTKPOSTagger`, which
+        adds attribute `pos` to `ft.onto.base_ontology.Token`,
+        to :attr:`forte.data.data_pack.Meta.record`.
+
+        Args:
+            record_meta: the field in the datapack for the type record that
+                needs to be filled in for consistency checking.
+        """
+        record_meta["ft.onto.base_ontology.Token"].add("pos")
+
+    def expected_types_and_attributes(self):
+        r"""Method to add the expected type `ft.onto.base_ontology.Token` for
+        input, which would be checked before running the processor if
+        the pipeline is initialized with
+        `enforce_consistency=True` or
+        :meth:`~forte.pipeline.Pipeline.enforce_consistency` was enabled for
+        the pipeline.
+        """
+        return {"ft.onto.base_ontology.Token": set()}
+
+
+class NLP_Pipeline_Performance_Test(unittest.TestCase):
+    """
+    Test performance of POS, NER.
+    """
+
+    def setUp(self) -> None:
+        self.nlp = Pipeline[DataPack]()
+        # self.nlp.set_reader(StringReader())
+
+    def test_POS_tagging(self):  # input_output_pair , , input_path : str
+        """
+        Verify the intermediate representation of pipeline.
+        """
+        pack_output = "pack_out"
+        input_path = (
+            "/Users/jamesxiao/Downloads/Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/"
+            "data/conll-2012-test/data/english/annotations/bc/phoenix/00/"
+        )  # msnbc_0007.gold_conll
+
+        self.nlp.set_reader(OntonotesReader())
+        # self.nlp.set_reader(StringReader())
+        self.nlp.add(NLTKSentenceSegmenter())  # SentenceAndTokenProcessor
+        self.nlp.add(NLTKWordTokenizer())
+        self.nlp.add(NLTKPOSTagger())  # #ExampleNLTKPOSTagger()
+
+        # self.nlp.add(SentenceAndTokenProcessor()) #, {"processors": ["sentence", "tokenize"]}
+        # self.nlp.add(ExampleNLTKPOSTagger())
+
+        # self.nlp.add(
+        #     PackNameJsonPackWriter(),
+        #     {
+        #         "output_dir": pack_output,
+        #         "indent": 2,
+        #         "overwrite": True,
+        #     },
+        # )
+
+        input_string = (
+            "Forte is a data-centric ML framework. Muad Dib learned rapidly because his first training was in how to learn. "
+            "And the first lesson of all was the basic trust that he could learn. "
+            " It's shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult. 
" + ) + + # self.nlp.initialize() + # rs = self.nlp.run(input_path) + for pack in self.nlp.initialize().process_dataset( + input_path + ): # initialize().run(input_path): #: rs: # + for sentence in pack.get("ft.onto.base_ontology.Sentence"): + print("The sentence is: ", sentence.text) + print("The POS tags of the tokens are:") + for token in pack.get(Token, sentence): + print(f" {token.text}[{token.pos}]", end=" ") + print() + + +def define_skip_condition(flag: str, explanation: str): + return unittest.skipUnless( + os.environ.get(flag, 0) or os.environ.get("TEST_ALL", 0), + explanation + f" Set `{flag}=1` or `TEST_ALL=1` to run.", + ) + + +performance_test = define_skip_condition( + "TEST_PERFORMANCE", "Test the performance of Forte modules." +) From 413bcaf5b72b49d63ef4f3009b59d7bda8c3def3 Mon Sep 17 00:00:00 2001 From: JamesX Date: Tue, 18 Oct 2022 17:44:48 +0400 Subject: [PATCH 2/7] Fixed a few parameter issues (input_path need to be supplied from parameter) --- tests/forte/data/data_pack_profiling_test.py | 42 +++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/tests/forte/data/data_pack_profiling_test.py b/tests/forte/data/data_pack_profiling_test.py index 5e2a54415..1e8e57956 100644 --- a/tests/forte/data/data_pack_profiling_test.py +++ b/tests/forte/data/data_pack_profiling_test.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Utils for unit tests. +profiling test for data pack: using entry (0.2.0) and using new methods (>0.3.0). """ import os -import re import unittest import nltk @@ -50,6 +49,7 @@ from forte import Pipeline from nltk.tokenize.treebank import TreebankWordTokenizer + # from fortex.spacy import SpacyProcessor @@ -155,7 +155,7 @@ def record(record_meta: Dict[str, Set[str]]): record_meta["ft.onto.base_ontology.Token"].add("lemma") def process_tokens( - processors, sentences, input_pack: DataPack + processors, sentences, input_pack: DataPack ) -> Dict[int, Token]: """Basic tokenization and post tagging of the sentence. Args: @@ -353,21 +353,31 @@ def setUp(self) -> None: self.nlp = Pipeline[DataPack]() # self.nlp.set_reader(StringReader()) - def test_POS_tagging(self): # input_output_pair , , input_path : str + def test_POS_tagging(self, input_path: str = ''): # input_output_pair , """ Verify the intermediate representation of pipeline. """ pack_output = "pack_out" - input_path = ( - "/Users/jamesxiao/Downloads/Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" - "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" - ) # msnbc_0007.gold_conll - - self.nlp.set_reader(OntonotesReader()) - # self.nlp.set_reader(StringReader()) + # input_path = ( + # "...Path to ... /Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" + # "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" + # ) + if len(input_path) == 0: + self.nlp.set_reader(StringReader()) + input_param = ( + "Forte is a data-centric ML framework. Muad Dib learned \ + rapidly because his first training was in how to learn. " + "And the first lesson of all was the basic trust that he \ + could learn. " + "It's shocking to find how many people do not believe they \ + can learn, and how many more believe learning to be difficult." 
+ ) + else: + self.nlp.set_reader(OntonotesReader()) + input_param = input_path self.nlp.add(NLTKSentenceSegmenter()) # SentenceAndTokenProcessor self.nlp.add(NLTKWordTokenizer()) - self.nlp.add(NLTKPOSTagger()) # #ExampleNLTKPOSTagger() + self.nlp.add(NLTKPOSTagger()) # #ExampleNLTKPOSTagger() # self.nlp.add(SentenceAndTokenProcessor()) #, {"processors": ["sentence", "tokenize"]} # self.nlp.add(ExampleNLTKPOSTagger()) @@ -381,16 +391,10 @@ def test_POS_tagging(self): # input_output_pair , , input_path : str # }, # ) - input_string = ( - "Forte is a data-centric ML framework. Muad Dib learned rapidly because his first training was in how to learn. " - "And the first lesson of all was the basic trust that he could learn. " - " It's shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult. " - ) - # self.nlp.initialize() # rs = self.nlp.run(input_path) for pack in self.nlp.initialize().process_dataset( - input_path + input_param ): # initialize().run(input_path): #: rs: # for sentence in pack.get("ft.onto.base_ontology.Sentence"): print("The sentence is: ", sentence.text) From 5e2da8e266ccf63769c339f7db4a838ab5a8b64d Mon Sep 17 00:00:00 2001 From: JamesX Date: Fri, 28 Oct 2022 13:14:53 +0400 Subject: [PATCH 3/7] Added NER and serialization test --- tests/forte/data/data_pack_profiling_test.py | 71 ++++++++++++++++++-- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/tests/forte/data/data_pack_profiling_test.py b/tests/forte/data/data_pack_profiling_test.py index 1e8e57956..f2dcdf30b 100644 --- a/tests/forte/data/data_pack_profiling_test.py +++ b/tests/forte/data/data_pack_profiling_test.py @@ -155,7 +155,7 @@ def record(record_meta: Dict[str, Set[str]]): record_meta["ft.onto.base_ontology.Token"].add("lemma") def process_tokens( - processors, sentences, input_pack: DataPack + processors, sentences, input_pack: DataPack ) -> Dict[int, Token]: """Basic tokenization and post tagging of the sentence. Args: @@ -353,15 +353,14 @@ def setUp(self) -> None: self.nlp = Pipeline[DataPack]() # self.nlp.set_reader(StringReader()) - def test_POS_tagging(self, input_path: str = ''): # input_output_pair , + def testPOSTaggingNER(self, input_path: str = ""): # input_output_pair , """ Verify the intermediate representation of pipeline. """ - pack_output = "pack_out" # input_path = ( - # "...Path to ... 
/Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/"
+        #     "..Path_to.../Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/"
         #     "data/conll-2012-test/data/english/annotations/bc/phoenix/00/"
-        #     )
+        # )
         if len(input_path) == 0:
             self.nlp.set_reader(StringReader())
             input_param = (
@@ -378,14 +377,17 @@ def test_POS_tagging(self, input_path: str = ''):  # input_output_pair ,
         self.nlp.add(NLTKSentenceSegmenter())  # SentenceAndTokenProcessor
         self.nlp.add(NLTKWordTokenizer())
-        self.nlp.add(NLTKPOSTagger())  # #ExampleNLTKPOSTagger()
+        self.nlp.add(NLTKPOSTagger())  # #ExampleNLTKPOSTagger()
+        self.nlp.add(NLTKNER())
 
         # self.nlp.add(SentenceAndTokenProcessor()) #, {"processors": ["sentence", "tokenize"]}
         # self.nlp.add(ExampleNLTKPOSTagger())
 
+        # pack_output_dir = "./test_pack_output/"
+        #
         # self.nlp.add(
         #     PackNameJsonPackWriter(),
         #     {
-        #         "output_dir": pack_output,
+        #         "output_dir": pack_output_dir,
         #         "indent": 2,
         #         "overwrite": True,
         #     },
         # )
@@ -403,6 +405,61 @@ def test_POS_tagging(self, input_path: str = ''):  # input_output_pair ,
                     print(f" {token.text}[{token.pos}]", end=" ")
                 print()
 
+    def testSimpleSerialization(self, input_path: str = ""):
+        """
+        Verify the intermediate representation of pipeline.
+        """
+        # input_path = (
+        #     "...Path_to.../Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/"
+        #     "data/conll-2012-test/data/english/annotations/bc/phoenix/00/"
+        # )
+        output_path = "./test_simple_pack_output/"
+
+        if len(input_path) == 0:
+            self.nlp.set_reader(StringReader())
+            input_param = (
+                "Forte is a data-centric ML framework. Muad Dib learned \
+                rapidly because his first training was in how to learn. "
+                "And the first lesson of all was the basic trust that he \
+                could learn. "
+                "It's shocking to find how many people do not believe they \
+                can learn, and how many more believe learning to be difficult."
+            )
+        else:
+            self.nlp.set_reader(OntonotesReader())
+            input_param = input_path
+
+        self.nlp.add(
+            PackNameJsonPackWriter(),
+            {
+                "output_dir": output_path,
+                "indent": 2,
+                "overwrite": True,
+            },
+        )
+
+        # self.nlp.add(NLTKSentenceSegmenter())  # SentenceAndTokenProcessor
+        # self.nlp.add(NLTKWordTokenizer())
+        # self.nlp.add(NLTKPOSTagger())  # #ExampleNLTKPOSTagger()
+
+        self.nlp.run(input_param)
+
+        coref_pl = Pipeline()
+        coref_pl.set_reader(DirPackReader())
+        # coref_pl.add(MultiPackBoxer())
+
+        coref_pl.run(output_path)
+
+        # for pack in self.nlp.initialize().process_dataset(
+        #     output_path #input_param
+        # ): # initialize().run(input_path): #: rs: #
+        #     for sentence in pack.get("ft.onto.base_ontology.Sentence"):
+        #         print("The sentence is: ", sentence.text)
+        #         print("The POS tags of the tokens are:")
+        #         for token in pack.get(Token, sentence):
+        #             print(f" {token.text}[{token.pos}]", end=" ")
+        #         print()
+
 
 def define_skip_condition(flag: str, explanation: str):
     return unittest.skipUnless(

From 8d12c4e009706938ac7d8b9e19258bab04078bc8 Mon Sep 17 00:00:00 2001
From: JamesX
Date: Fri, 18 Nov 2022 11:42:07 +0400
Subject: [PATCH 4/7] PR submission for the current version of the test (which
 detects a bottleneck related to the nested generator/sortedlist area in
 DataPack).
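
For reference, the hot path flagged here is the nested entry iteration
that testPOSTaggingNER exercises: an outer generator over sentences plus
an inner DataPack.get() call per sentence, each walking a sorted entry
list. A minimal sketch of that pattern (illustrative only; `pipeline` and
`input_param` stand for the pipeline and input used in the test):

    for pack in pipeline.initialize().process_dataset(input_param):
        for sentence in pack.get("ft.onto.base_ontology.Sentence"):
            # Each nested get() re-enters the pack's sorted entry storage.
            for token in pack.get(Token, sentence):
                _ = token.pos  # touch an attribute on every entry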
--- tests/forte/data/data_pack_profiling_test.py | 45 +++++--------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/tests/forte/data/data_pack_profiling_test.py b/tests/forte/data/data_pack_profiling_test.py index f2dcdf30b..0b01bd940 100644 --- a/tests/forte/data/data_pack_profiling_test.py +++ b/tests/forte/data/data_pack_profiling_test.py @@ -23,9 +23,7 @@ "performance_test", ] -from typing import Any, Callable - -from typing import Optional, Dict, Set, List, Any, Iterator +from typing import Dict, Set from forte.common.configuration import Config from forte.common.resources import Resources @@ -33,7 +31,7 @@ from forte.data.readers import OntonotesReader, DirPackReader, StringReader from forte.processors.writers import PackNameJsonPackWriter -from ft.onto.base_ontology import EntityMention, Token, Sentence, Phrase +from ft.onto.base_ontology import EntityMention, Token, Sentence from nltk import ( # type: ignore pos_tag, ne_chunk, @@ -49,7 +47,6 @@ from forte import Pipeline from nltk.tokenize.treebank import TreebankWordTokenizer - # from fortex.spacy import SpacyProcessor @@ -357,10 +354,10 @@ def testPOSTaggingNER(self, input_path: str = ""): # input_output_pair , """ Verify the intermediate representation of pipeline. """ - # input_path = ( - # "..Path_to.../Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" - # "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" - # ) + input_path = ( + "/Users/jamesxiao/Downloads/Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" + "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" + ) if len(input_path) == 0: self.nlp.set_reader(StringReader()) input_param = ( @@ -382,17 +379,6 @@ def testPOSTaggingNER(self, input_path: str = ""): # input_output_pair , # self.nlp.add(SentenceAndTokenProcessor()) #, {"processors": ["sentence", "tokenize"]} # self.nlp.add(ExampleNLTKPOSTagger()) - # pack_output_dir = "./test_pack_output/" - # - # self.nlp.add( - # PackNameJsonPackWriter(), - # { - # "output_dir": pack_output_dir, - # "indent": 2, - # "overwrite": True, - # }, - # ) - # self.nlp.initialize() # rs = self.nlp.run(input_path) for pack in self.nlp.initialize().process_dataset( @@ -409,10 +395,10 @@ def testSimpleSerialization(self, input_path: str = ""): """ Verify the intermediate representation of pipeline. 
""" - # input_path = ( - # "...Path_to.../Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" - # "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" - # ) + input_path = ( + "/Users/jamesxiao/Downloads/Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" + "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" + ) output_path = "./test_simple_pack_output/" if len(input_path) == 0: @@ -447,19 +433,8 @@ def testSimpleSerialization(self, input_path: str = ""): coref_pl = Pipeline() coref_pl.set_reader(DirPackReader()) # coref_pl.add(MultiPackBoxer()) - coref_pl.run(output_path) - # for pack in self.nlp.initialize().process_dataset( - # output_path #input_param - # ): # initialize().run(input_path): #: rs: # - # for sentence in pack.get("ft.onto.base_ontology.Sentence"): - # print("The sentence is: ", sentence.text) - # print("The POS tags of the tokens are:") - # for token in pack.get(Token, sentence): - # print(f" {token.text}[{token.pos}]", end=" ") - # print() - def define_skip_condition(flag: str, explanation: str): return unittest.skipUnless( From ce8a1d2cffa663eba5b725a5713c6f3ff2a82ba0 Mon Sep 17 00:00:00 2001 From: JamesX Date: Fri, 18 Nov 2022 12:11:32 +0400 Subject: [PATCH 5/7] Fixed related testing directory issue (remove dir name on local machine) : please provide conll data directory to the commented out "input_dir" parameter in code. --- tests/forte/data/data_pack_profiling_test.py | 33 ++++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/tests/forte/data/data_pack_profiling_test.py b/tests/forte/data/data_pack_profiling_test.py index 0b01bd940..9608b65ac 100644 --- a/tests/forte/data/data_pack_profiling_test.py +++ b/tests/forte/data/data_pack_profiling_test.py @@ -354,10 +354,10 @@ def testPOSTaggingNER(self, input_path: str = ""): # input_output_pair , """ Verify the intermediate representation of pipeline. """ - input_path = ( - "/Users/jamesxiao/Downloads/Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" - "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" - ) + # input_path = ( + # "...path_to_conll ... /Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" + # "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" + # ) if len(input_path) == 0: self.nlp.set_reader(StringReader()) input_param = ( @@ -395,10 +395,10 @@ def testSimpleSerialization(self, input_path: str = ""): """ Verify the intermediate representation of pipeline. """ - input_path = ( - "/Users/jamesxiao/Downloads/Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" - "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" - ) + # input_path = ( + # "... path_to_conll ... 
/Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" + # "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" + # ) output_path = "./test_simple_pack_output/" if len(input_path) == 0: @@ -414,15 +414,14 @@ def testSimpleSerialization(self, input_path: str = ""): else: self.nlp.set_reader(OntonotesReader()) input_param = input_path - - self.nlp.add( - PackNameJsonPackWriter(), - { - "output_dir": output_path, - "indent": 2, - "overwrite": True, - }, - ) + self.nlp.add( + PackNameJsonPackWriter(), + { + "output_dir": output_path, + "indent": 2, + "overwrite": True, + }, + ) # self.nlp.add(NLTKSentenceSegmenter()) # SentenceAndTokenProcessor # self.nlp.add(NLTKWordTokenizer()) From d5e714a289c9ce649cb854d8286a265f0f5411a4 Mon Sep 17 00:00:00 2001 From: JamesX Date: Fri, 18 Nov 2022 12:31:13 +0400 Subject: [PATCH 6/7] Fix output dir issue in test (removed local dir name): please also provide output local dir name in serialization test case. --- tests/forte/data/data_pack_profiling_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/forte/data/data_pack_profiling_test.py b/tests/forte/data/data_pack_profiling_test.py index 9608b65ac..3aaae763f 100644 --- a/tests/forte/data/data_pack_profiling_test.py +++ b/tests/forte/data/data_pack_profiling_test.py @@ -391,7 +391,7 @@ def testPOSTaggingNER(self, input_path: str = ""): # input_output_pair , print(f" {token.text}[{token.pos}]", end=" ") print() - def testSimpleSerialization(self, input_path: str = ""): + def testSimpleSerialization(self, input_path: str = "", output_path: str = ""): """ Verify the intermediate representation of pipeline. """ @@ -399,7 +399,7 @@ def testSimpleSerialization(self, input_path: str = ""): # "... path_to_conll ... /Semantic-Role-Labeling-master/conll-formatted-ontonotes-5.0/" # "data/conll-2012-test/data/english/annotations/bc/phoenix/00/" # ) - output_path = "./test_simple_pack_output/" + # output_path = "./test_simple_pack_output/" if len(input_path) == 0: self.nlp.set_reader(StringReader()) @@ -432,7 +432,8 @@ def testSimpleSerialization(self, input_path: str = ""): coref_pl = Pipeline() coref_pl.set_reader(DirPackReader()) # coref_pl.add(MultiPackBoxer()) - coref_pl.run(output_path) + if len(output_path) > 0: + coref_pl.run(output_path) def define_skip_condition(flag: str, explanation: str): From 3b08be5e939b7009c66994c17f9842d875d970db Mon Sep 17 00:00:00 2001 From: JamesX Date: Tue, 3 Jan 2023 17:42:24 +0400 Subject: [PATCH 7/7] Fixed multiple comments for this PR. --- tests/forte/data/data_pack_profiling_test.py | 101 ++----------------- 1 file changed, 8 insertions(+), 93 deletions(-) diff --git a/tests/forte/data/data_pack_profiling_test.py b/tests/forte/data/data_pack_profiling_test.py index 3aaae763f..98cfce492 100644 --- a/tests/forte/data/data_pack_profiling_test.py +++ b/tests/forte/data/data_pack_profiling_test.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -profiling test for data pack: using entry (0.2.0) and using new methods (>0.3.0). +profiling test for data pack: using typical usage scenarios such as POS +tagging, NER, serialization to check for possible bottlenecks. 
""" import os import unittest -import nltk __all__ = [ "performance_test", ] -from typing import Dict, Set +from typing import Dict, Set, List from forte.common.configuration import Config from forte.common.resources import Resources @@ -47,25 +47,17 @@ from forte import Pipeline from nltk.tokenize.treebank import TreebankWordTokenizer -# from fortex.spacy import SpacyProcessor - class SentenceAndTokenProcessor(PackProcessor): - def __init__(self): - super().__init__() - - def initialize(self, resources, configs): - super().initialize(resources, configs) def process_tokens(self, sentences, input_pack: DataPack): """Basic tokenization and post tagging of the sentence. Args: - processors: List of processor names. sentences: Generator object which yields sentences in document. input_pack: input pack which needs to be modified. Returns: A mapping from SpaCy token index to Forte Token. """ - tokens: [Token] = [] + tokens: List[Token] = [] last_sentence_word_idx = 0 for s_idx, sentence in sentences: @@ -85,7 +77,7 @@ def _process(self, input_pack: DataPack): sentences = sent_tokenize(doc) # tokens = process_tokens(sentences, input_pack) # sentences, input_pack - tokens: [Token] = [] + tokens: List[Token] = [] last_sentence_word_idx = 0 s_idx = 0 @@ -117,73 +109,6 @@ def record(self, record_meta: Dict[str, Set[str]]): record_meta["ft.onto.base_ontology.Token"] = set() -class ExampleNLTKPOSTagger(PackProcessor): - r"""A wrapper of NLTK pos tagger.""" - - def initialize(self, resources, configs): - super().initialize(resources, configs) - # download the NLTK average perceptron tagger - nltk.download("averaged_perceptron_tagger") - - def _process(self, input_pack: DataPack): - # get a list of token data entries from `input_pack` - # using `DataPack.get()`` method - - token_texts = [token.text for token in input_pack.get(Token)] - - # use nltk pos tagging module to tag token texts - taggings = nltk.pos_tag(token_texts) - - # assign nltk taggings to token attributes - for token, tag in zip(input_pack.get(Token), taggings): - token.pos = tag[1] - - # token.pos = word.tag_ - - # token.lemma = word.lemma_ - - # Store the spacy token index to forte token mapping. - # indexed_tokens[word.i] = token - - # return indexed_tokens - - def record(record_meta: Dict[str, Set[str]]): - record_meta["ft.onto.base_ontology.Token"].add("pos") - record_meta["ft.onto.base_ontology.Token"].add("lemma") - - def process_tokens( - processors, sentences, input_pack: DataPack - ) -> Dict[int, Token]: - """Basic tokenization and post tagging of the sentence. - Args: - processors: List of processor names. - sentences: Generator object which yields sentences in document. - input_pack: input pack which needs to be modified. - Returns: A mapping from SpaCy token index to Forte Token. - """ - indexed_tokens: Dict[int, Token] = {} - - for sentence in sentences: - Sentence(input_pack, sentence.start_char, sentence.end_char) - - if "tokenize" in processors: - # Iterating through spaCy token objects - for word in sentence: - begin_pos_word = word.idx - end_pos_word = begin_pos_word + len(word.text) - token = Token(input_pack, begin_pos_word, end_pos_word) - - if "pos" in processors: - token.pos = word.tag_ - - if "lemma" in processors: - token.lemma = word.lemma_ - - # Store the spacy token index to forte token mapping. 
- indexed_tokens[word.i] = token - return indexed_tokens - - class NLTKNER(PackProcessor): r"""A wrapper of NLTK NER.""" @@ -343,12 +268,11 @@ def expected_types_and_attributes(self): class NLP_Pipeline_Performance_Test(unittest.TestCase): """ - Test performance of POS, NER. + Test performance for POS, NER tasks. """ def setUp(self) -> None: self.nlp = Pipeline[DataPack]() - # self.nlp.set_reader(StringReader()) def testPOSTaggingNER(self, input_path: str = ""): # input_output_pair , """ @@ -373,16 +297,11 @@ def testPOSTaggingNER(self, input_path: str = ""): # input_output_pair , input_param = input_path self.nlp.add(NLTKSentenceSegmenter()) # SentenceAndTokenProcessor self.nlp.add(NLTKWordTokenizer()) - self.nlp.add(NLTKPOSTagger()) # #ExampleNLTKPOSTagger() + self.nlp.add(NLTKPOSTagger()) self.nlp.add(NLTKNER()) - # self.nlp.add(SentenceAndTokenProcessor()) #, {"processors": ["sentence", "tokenize"]} - # self.nlp.add(ExampleNLTKPOSTagger()) - - # self.nlp.initialize() - # rs = self.nlp.run(input_path) for pack in self.nlp.initialize().process_dataset( - input_param + input_param ): # initialize().run(input_path): #: rs: # for sentence in pack.get("ft.onto.base_ontology.Sentence"): print("The sentence is: ", sentence.text) @@ -423,10 +342,6 @@ def testSimpleSerialization(self, input_path: str = "", output_path: str = ""): }, ) - # self.nlp.add(NLTKSentenceSegmenter()) # SentenceAndTokenProcessor - # self.nlp.add(NLTKWordTokenizer()) - # self.nlp.add(NLTKPOSTagger()) # #ExampleNLTKPOSTagger() - self.nlp.run(input_param) coref_pl = Pipeline()
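
A note on running these tests: the module defines and exports a
`performance_test` skip condition gated on the `TEST_PERFORMANCE` (or
`TEST_ALL`) environment variable, but none of the patches above attach it
to the test class, so the tests currently run unconditionally. Below is a
minimal, hypothetical driver for profiling the suite with the standard
library's cProfile/pstats; it is illustrative only and not part of the
patches, and the import path assumes the repository root is on sys.path.

    # Hypothetical profiling driver (not part of the patches above).
    import cProfile
    import pstats
    import unittest

    from tests.forte.data.data_pack_profiling_test import (
        NLP_Pipeline_Performance_Test,
    )

    # Collect the test case into a suite and run it under cProfile.
    suite = unittest.TestLoader().loadTestsFromTestCase(
        NLP_Pipeline_Performance_Test
    )
    profiler = cProfile.Profile()
    profiler.enable()
    unittest.TextTestRunner().run(suite)
    profiler.disable()
    profiler.dump_stats("datapack.prof")

    # The nested-generator/sorted-list cost noted in PATCH 4/7 shows up at
    # the top of the cumulative-time listing.
    pstats.Stats("datapack.prof").sort_stats("cumulative").print_stats(20)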