Updated the documentation.

StarlangSoftware · May 24, 2024 · 083d055 · 083d055
1 parent cb47c85
commit 083d055
Show file tree

Hide file tree

Showing 4 changed files with 259 additions and 9 deletions.
diff --git a/src/main/java/MorphologicalAnalysis/FsmMorphologicalAnalyzer.java b/src/main/java/MorphologicalAnalysis/FsmMorphologicalAnalyzer.java
@@ -101,6 +101,11 @@ public FsmMorphologicalAnalyzer(TxtDictionary dictionary) {
         this("turkish_finite_state_machine.xml", dictionary, 10000000);
     }
 
+    /**
+     * Constructs and returns the reverse string of a given string.
+     * @param s String to be reversed.
+     * @return Reverse of a given string.
+     */
     private String reverseString(String s){
         StringBuilder result = new StringBuilder();
         for (int i = s.length() - 1; i >= 0; i--){
@@ -109,6 +114,11 @@ private String reverseString(String s){
         return result.toString();
     }
 
+    /**
+     * Constructs the suffix trie from the input file suffixes.txt. suffixes.txt contains the most frequent 6000
+     * suffixes that a verb or a noun can take. The suffix trie is a trie that stores these suffixes in reverse form,
+     * which can be then used to match a given word for its possible suffix content.
+     */
     private void prepareSuffixTrie(){
         suffixTrie = new Trie();
         Scanner inputFile = new Scanner(FileUtils.getInputStream("suffixes.txt"));
@@ -120,6 +130,11 @@ private void prepareSuffixTrie(){
         inputFile.close();
     }
 
+    /**
+     * Reads the file for correct surface forms and their most frequent root forms, in other words, the surface forms
+     * which have at least one morphological analysis in Turkish.
+     * @param fileName Input file containing analyzable surface forms and their root forms.
+     */
     public void addParsedSurfaceForms(String fileName){
         parsedSurfaceForms = new HashMap<>();
         String line;
@@ -805,6 +820,11 @@ public ArrayList<FsmParse> morphologicalAnalysis(TxtWord root, String surfaceFor
         return parseWord(initialFsmParse, surfaceForm);
     }
 
+    /**
+     * Given a set of morphological parses, this method returns all surface forms of those parses.
+     * @param parseList Morphological parse list.
+     * @return All distinct surface forms for a given set of morphological parses.
+     */
     private HashSet<String> distinctSurfaceFormList(ArrayList<FsmParse> parseList){
         HashSet<String> items = new HashSet<>();
         for (FsmParse parse : parseList){
@@ -813,6 +833,14 @@ private HashSet<String> distinctSurfaceFormList(ArrayList<FsmParse> parseList){
         return items;
     }
 
+    /**
+     * This method generates all possible surface forms that can be generated by the morphological analyzer with the
+     * current root forms from the dictionary. Since the number of all possible surface forms is infinite in Turkish,
+     * this method bounds the length of the possible surface forms. It includes only those surface forms that can be
+     * obtained by adding at most 4 characters to the root form. The method prints all distinct surface forms to an
+     * output file.
+     * @param outputFile Output file that will contain distinct possible surface forms.
+     */
     public void outputAllParses(String outputFile){
         try {
             PrintWriter pw = new PrintWriter(outputFile);
@@ -1074,6 +1102,15 @@ private ArrayList<FsmParse> analysis(String surfaceForm, boolean isProper) {
         return parseWord(initialFsmParse, surfaceForm);
     }
 
+    /**
+     * This method uses a cache to speed up pattern matching in Fsm. mostUsedPatterns stores the compiled forms of
+     * the previously used patterns. When Fsm tries to match a string to a pattern, first we check if it exists in
+     * mostUsedPatterns. If it exists, we directly use the compiled pattern to match the string. Otherwise, a new
+     * pattern is compiled and put in mostUsedPatterns.
+     * @param expr Pattern to check
+     * @param value String to match the pattern
+     * @return True if the string matches the pattern, false otherwise.
+     */
     private boolean patternMatches(String expr, String value){
         Pattern p = mostUsedPatterns.get(expr);
         if (p == null){
@@ -1101,7 +1138,7 @@ public boolean isProperNoun(String surfaceForm) {
      * The isCode method takes surfaceForm String as input and checks if it consists of both letters and numbers
      *
      * @param surfaceForm String to check for code-like word.
-     * @return true if it is a code-like word, return false otherwise.
+     * @return True if it is a code-like word, false otherwise.
      */
     private boolean isCode(String surfaceForm) {
         if (surfaceForm == null || surfaceForm.isEmpty()) {
@@ -1110,6 +1147,19 @@ private boolean isCode(String surfaceForm) {
         return patternMatches(".*[0-9].*", surfaceForm) && patternMatches(".*[a-zA-ZçöğüşıÇÖĞÜŞİ].*", surfaceForm);
     }
 
+    /**
+     * Identifies a possible new root word for a given surface form. It also adds the new root form to the dictionary
+     * for further usage. The method first searches the suffix trie for the reverse string of the surface form. This
+     * way, it can identify if the word has a suffix that is in the most frequently used suffix list. Since a word can
+     * have multiple possible suffixes, the method identifies the longest suffix and returns the substring of the
+     * surface form that does not contain the suffix. Let us say the word is 'googlelaştırdık', it will identify 'dık' as
+     * a suffix and will return 'googlelaştır' as a possible root form. Another example will be 'homelesslerimizle', it
+     * will identify 'lerimizle' as suffix and will return 'homeless' as a possible root form. If the root word ends
+     * with 'ğ', it is replaced with 'k'. 'morfolojikliğini' will return 'morfolojikliğ', which will then be replaced
+     * with 'morfolojiklik'.
+     * @param surfaceForm Surface form for which we will identify a possible new root form.
+     * @return Possible new root form.
+     */
     private TxtWord rootOfPossiblyNewWord(String surfaceForm){
         HashSet<Word> words = suffixTrie.getWordsWithPrefix(reverseString(surfaceForm));
         int maxLength = 0;
@@ -1282,18 +1332,39 @@ private boolean isNumber(String surfaceForm) {
         return word.isEmpty() && count > 1;
     }
 
+    /**
+     * Checks if a given surface form matches to a percent value. It should be something like %4, %45, %4.3 or %56.786
+     * @param surfaceForm Surface form to be checked.
+     * @return True if the surface form is in percent form
+     */
     private boolean isPercent(String surfaceForm){
         // Integer percentage: '%' followed by one or two digits.
         if (patternMatches("%(\\d\\d|\\d)", surfaceForm)){
             return true;
         }
         // Decimal percentage: the same prefix followed by '.' and at least one more digit.
         return patternMatches("%(\\d\\d|\\d)\\.\\d+", surfaceForm);
     }
 
+    /**
+     * Checks if a given surface form matches to a time form. It should be something like 3:34, 12:56 or 12:56:07.
+     * @param surfaceForm Surface form to be checked.
+     * @return True if the surface form is in time form
+     */
     private boolean isTime(String surfaceForm) {
         // First try the three-field form (two ':' separators), each field being one or two digits.
         boolean matched = patternMatches("(\\d\\d|\\d):(\\d\\d|\\d):(\\d\\d|\\d)", surfaceForm);
         if (!matched){
             // Fall back to the two-field form (a single ':' separator).
             matched = patternMatches("(\\d\\d|\\d):(\\d\\d|\\d)", surfaceForm);
         }
         return matched;
     }
 
+    /**
+     * Checks if a given surface form matches to a range form. It should be something like 123-1400 or 12:34-15:78 or
+     * 3.45-4.67.
+     * @param surfaceForm Surface form to be checked.
+     * @return True if the surface form is in range form
+     */
     private boolean isRange(String surfaceForm) {
         // Candidate range shapes, tried in order: number-number, time-time and decimal-decimal.
         String[] rangeExpressions = {
                 "\\d+-\\d+",
                 "(\\d\\d|\\d):(\\d\\d|\\d)-(\\d\\d|\\d):(\\d\\d|\\d)",
                 "(\\d\\d|\\d)\\.(\\d\\d|\\d)-(\\d\\d|\\d)\\.(\\d\\d|\\d)"
         };
         for (String expression : rangeExpressions){
             // Stop at the first expression that matches.
             if (patternMatches(expression, surfaceForm)){
                 return true;
             }
         }
         return false;
     }
 
+    /**
+     * Checks if a given surface form matches to a date form. It should be something like 3/10/2023 or 2.3.2012
+     * @param surfaceForm Surface form to be checked.
+     * @return True if the surface form is in date form
+     */
     private boolean isDate(String surfaceForm) {
         // Two one-or-two digit fields followed by a digit run, separated by '/'.
         final String slashSeparated = "(\\d\\d|\\d)/(\\d\\d|\\d)/\\d+";
         // The same layout, separated by '.'.
         final String dotSeparated = "(\\d\\d|\\d)\\.(\\d\\d|\\d)\\.\\d+";
         if (patternMatches(slashSeparated, surfaceForm)){
             return true;
         }
         return patternMatches(dotSeparated, surfaceForm);
     }

diff --git a/src/main/java/MorphologicalAnalysis/MorphologicalParse.java b/src/main/java/MorphologicalAnalysis/MorphologicalParse.java
@@ -571,6 +571,12 @@ public String getTreePos(){
         return "-XXX-";
     }
 
+    /**
+     * Returns the pronoun type of the parse for universal dependency feature PronType.
+     * @return "Art" if the pronoun is also a determiner; "Prs" if the pronoun is personal pronoun; "Rcp" if the
+     * pronoun is 'birbiri'; "Ind" if the pronoun is an indeterminate pronoun; "Neg" if the pronoun is 'hiçbiri';
+     * "Int" if the pronoun is a question pronoun; "Dem" if the pronoun is a demonstrative pronoun.
+     */
     private String getPronType(){
         String lemma = root.getName();
         if (containsTag(MorphologicalTag.DETERMINER)){
@@ -604,6 +610,11 @@ private String getPronType(){
         return null;
     }
 
+    /**
+     * Returns the numeral type of the parse for universal dependency feature NumType.
+     * @return "Ord" if the parse is Time, Ordinal or the word is '%' or 'kaçıncı'; "Dist" if the word is a
+     * distributive number such as 'beşer'; "Card" if the number is cardinal or any number or the word is 'kaç'.
+     */
     private String getNumType(){
         String lemma = root.getName();
         if (lemma.equals("%") || containsTag(MorphologicalTag.TIME)){
@@ -621,6 +632,10 @@ private String getNumType(){
         return null;
     }
 
+    /**
+     * Returns the value for the dependency feature Reflex.
+     * @return "Yes" if the root word is 'kendi', null otherwise.
+     */
     private String getReflex(){
         String lemma = root.getName();
         if (lemma.equals("kendi")){
@@ -629,6 +644,11 @@ private String getReflex(){
         return null;
     }
 
+    /**
+     * Returns the agreement of the parse for the universal dependency feature Number.
+     * @return "Sing" if the agreement of the parse is singular (contains A1SG, A2SG, A3SG); "Plur" if the agreement
+     * of the parse is plural (contains A1PL, A2PL, A3PL).
+     */
     private String getNumber(){
         if (containsTag(MorphologicalTag.A1SG) || containsTag(MorphologicalTag.A2SG) || containsTag(MorphologicalTag.A3SG)){
             return "Sing";
@@ -639,6 +659,11 @@ private String getNumber(){
         return null;
     }
 
+    /**
+     * Returns the possessive agreement of the parse for the universal dependency feature Number[psor].
+     * @return "Sing" if the possessive agreement of the parse is singular (contains P1SG, P2SG, P3SG); "Plur" if the
+     * possessive agreement of the parse is plural (contains P1PL, P2PL, P3PL).
+     */
     private String getPossessiveNumber(){
         if (containsTag(MorphologicalTag.P1SG) || containsTag(MorphologicalTag.P2SG) || containsTag(MorphologicalTag.P3SG)){
             return "Sing";
@@ -649,6 +674,11 @@ private String getPossessiveNumber(){
         return null;
     }
 
+    /**
+     * Returns the case marking of the parse for the universal dependency feature case.
+     * @return "Acc" for accusative marker; "Dat" for dative marker; "Gen" for genitive marker; "Loc" for locative
+     * marker; "Ins" for instrumental marker; "Abl" for ablative marker; "Nom" for nominative marker.
+     */
     private String getCase(){
         if (containsTag(MorphologicalTag.ACCUSATIVE) || containsTag(MorphologicalTag.PCACCUSATIVE)){
             return "Acc";
@@ -674,6 +704,11 @@ private String getCase(){
         return null;
     }
 
+    /**
+     * Returns the definiteness of the parse for the universal dependency feature definite. It applies only for
+     * determiners in Turkish.
+     * @return "Ind" for 'bir', 'bazı', or 'birkaç'. "Def" for 'her', 'bu', 'şu', 'o', 'bütün'.
+     */
     private String getDefinite(){
         String lemma = root.getName();
         if (containsTag(MorphologicalTag.DETERMINER)){
@@ -687,6 +722,10 @@ private String getDefinite(){
         return null;
     }
 
+    /**
+     * Returns the degree of the parse for the universal dependency feature degree.
+     * @return "Cmp" for comparative adverb 'daha'; "Sup" for superlative adjective or adverb 'en'.
+     */
     private String getDegree(){
         String lemma = root.getName();
         if (lemma.equals("daha")){
@@ -698,6 +737,10 @@ private String getDegree(){
         return null;
     }
 
+    /**
+     * Returns the polarity of the verb for the universal dependency feature polarity.
+     * @return "Pos" for positive polarity containing tag POS; "Neg" for negative polarity containing tag NEG.
+     */
     private String getPolarity(){
         if (containsTag(MorphologicalTag.POSITIVE)){
             return "Pos";
@@ -708,6 +751,10 @@ private String getPolarity(){
         return null;
     }
 
+    /**
+     * Returns the person of the agreement of the parse for the universal dependency feature person.
+     * @return "1" for first person; "2" for second person; "3" for third person.
+     */
     private String getPerson(){
         if (containsTag(MorphologicalTag.A1SG) || containsTag(MorphologicalTag.A1PL)){
             return "1";
@@ -721,6 +768,10 @@ private String getPerson(){
         return null;
     }
 
+    /**
+     * Returns the person of the possessive agreement of the parse for the universal dependency feature Person[psor].
+     * @return "1" for first person; "2" for second person; "3" for third person.
+     */
     private String getPossessivePerson(){
         if (containsTag(MorphologicalTag.P1SG) || containsTag(MorphologicalTag.P1PL)){
             return "1";
@@ -734,6 +785,12 @@ private String getPossessivePerson(){
         return null;
     }
 
+    /**
+     * Returns the voice of the verb parse for the universal dependency feature voice.
+     * @return "CauPass" if the verb parse is both causative and passive; "Pass" if the verb parse is only passive;
+     * "Rcp" if the verb parse is reciprocal; "Cau" if the verb parse is only causative; "Rfl" if the verb parse is
+     * reflexive.
+     */
     private String getVoice(){
         if (containsTag(MorphologicalTag.CAUSATIVE) && containsTag(MorphologicalTag.PASSIVE)){
             return "CauPass";
@@ -753,6 +810,11 @@ private String getVoice(){
         return null;
     }
 
+    /**
+     * Returns the aspect of the verb parse for the universal dependency feature aspect.
+     * @return "Perf" for past, narrative and future tenses; "Prog" for progressive tenses; "Hab" for Aorist; "Rapid"
+     * for parses containing HASTILY tag; "Dur" for parses containing START, STAY or REPEAT tags.
+     */
     private String getAspect(){
         if (containsTag(MorphologicalTag.PASTTENSE) || containsTag(MorphologicalTag.NARRATIVE) || containsTag(MorphologicalTag.FUTURE)){
             return "Perf";
@@ -772,6 +834,11 @@ private String getAspect(){
         return null;
     }
 
+    /**
+     * Returns the tense of the verb parse for universal dependency feature tense.
+     * @return "Past" for simple past tense; "Fut" for future tense; "Pqp" for narrative past tense; "Pres" for other
+     * past tenses.
+     */
     private String getTense(){
         if (containsTag(MorphologicalTag.PASTTENSE)){
             return "Past";
@@ -788,22 +855,35 @@ private String getTense(){
         return null;
     }
 
+    /**
+     * Returns the modality of the verb parse for the universal dependency feature mood.
+     * @return "GenNecPot" if both necessitative and potential are combined with a suffix of general modality;
+     * "CndGenPot" if both conditional and potential are combined with a suffix of general modality;
+     * "GenNec" if necessitative is combined with a suffix of general modality;
+     * "GenPot" if potential is combined with a suffix of general modality;
+     * "NecPot" if necessitative is combined with potential;
+     * "DesPot" if desiderative is combined with potential;
+     * "CndPot" if conditional is combined with potential;
+     * "CndGen" if conditional is combined with a suffix of general modality;
+     * "Imp" for imperative; "Cnd" for simple conditional; "Des" for simple desiderative; "Opt" for optative; "Nec" for
+     * simple necessitative; "Pot" for simple potential; "Gen" for simple suffix of a general modality.
+     */
     private String getMood(){
         if ((containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.NECESSITY) && containsTag(MorphologicalTag.ABLE)){
             return "GenNecPot";
         }
-        if (containsTag(MorphologicalTag.CONDITIONAL) && (containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.ABLE)){
+        if ((containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.CONDITIONAL) && containsTag(MorphologicalTag.ABLE)){
             return "CndGenPot";
         }
         if ((containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.NECESSITY)){
             return "GenNec";
         }
-        if (containsTag(MorphologicalTag.NECESSITY) && containsTag(MorphologicalTag.ABLE)){
-            return "NecPot";
-        }
         if ((containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.ABLE)){
             return "GenPot";
         }
+        if (containsTag(MorphologicalTag.NECESSITY) && containsTag(MorphologicalTag.ABLE)){
+            return "NecPot";
+        }
         if (containsTag(MorphologicalTag.DESIRE) && containsTag(MorphologicalTag.ABLE)){
             return "DesPot";
         }
@@ -843,6 +923,11 @@ private String getMood(){
         return null;
     }
 
+    /**
+     * Returns the form of the verb parse for the universal dependency feature verbForm.
+     * @return "Part" for participles; "Vnoun" for infinitives; "Conv" for parses containing tags SINCEDOINGSO,
+     * WITHOUTHAVINGDONESO, WITHOUTBEINGABLETOHAVEDONESO, BYDOINGSO, AFTERDOINGSO, INFINITIVE3; "Fin" for others.
+     */
     private String getVerbForm(){
         if (containsTag(MorphologicalTag.PASTPARTICIPLE) || containsTag(MorphologicalTag.FUTUREPARTICIPLE) || containsTag(MorphologicalTag.PRESENTPARTICIPLE)){
             return "Part";
@@ -862,6 +947,12 @@ private String getVerbForm(){
         return null;
     }
 
+    /**
+     * Construct the universal dependency features as an array of strings. Each element represents a single feature.
+     * Every feature is given as featureType = featureValue.
+     * @param uPos Universal dependency part of speech tag for the parse.
+     * @return An array of universal dependency features for this parse.
+     */
     public ArrayList<String> getUniversalDependencyFeatures(String uPos){
         ArrayList<String> featureList = new ArrayList<>();
         String pronType = getPronType();
@@ -940,6 +1031,12 @@ public ArrayList<String> getUniversalDependencyFeatures(String uPos){
         return featureList;
     }
 
+    /**
+     * Returns the universal dependency part of speech for this parse.
+     * @return "AUX" for word 'değil'; "PROPN" for proper nouns; "NOUN" for nouns; "ADJ" for adjectives; "ADV" for
+     * adverbs; "INTJ" for interjections; "VERB" for verbs; "PUNCT" for punctuation symbols; "DET" for determiners;
+     * "NUM" for numerals; "PRON" for pronouns; "ADP" for postpositions; "SCONJ" or "CCONJ" for conjunctions.
+     */
     public String getUniversalDependencyPos(){
         String lemma = root.getName();
         if (lemma.equals("değil")){