Skip to content

Commit

Permalink
Updated the documentation.
Browse files Browse the repository at this point in the history
  • Loading branch information
Olcay Taner YILDIZ committed May 24, 2024
1 parent cb47c85 commit 083d055
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ public FsmMorphologicalAnalyzer(TxtDictionary dictionary) {
this("turkish_finite_state_machine.xml", dictionary, 10000000);
}

/**
* Constructs and returns the reverse string of a given string.
* @param s String to be reversed.
* @return Reverse of a given string.
*/
private String reverseString(String s){
StringBuilder result = new StringBuilder();
for (int i = s.length() - 1; i >= 0; i--){
Expand All @@ -109,6 +114,11 @@ private String reverseString(String s){
return result.toString();
}

/**
* Constructs the suffix trie from the input file suffixes.txt. suffixes.txt contains the most frequent 6000
* suffixes that a verb or a noun can take. The suffix trie is a trie that stores these suffixes in reverse form,
* which can be then used to match a given word for its possible suffix content.
*/
private void prepareSuffixTrie(){
suffixTrie = new Trie();
Scanner inputFile = new Scanner(FileUtils.getInputStream("suffixes.txt"));
Expand All @@ -120,6 +130,11 @@ private void prepareSuffixTrie(){
inputFile.close();
}

/**
* Reads the file for correct surface forms and their most frequent root forms, in other words, the surface forms
* which have at least one morphological analysis in Turkish.
* @param fileName Input file containing analyzable surface forms and their root forms.
*/
public void addParsedSurfaceForms(String fileName){
parsedSurfaceForms = new HashMap<>();
String line;
Expand Down Expand Up @@ -805,6 +820,11 @@ public ArrayList<FsmParse> morphologicalAnalysis(TxtWord root, String surfaceFor
return parseWord(initialFsmParse, surfaceForm);
}

/**
* Given a set of morphological parses, this method returns all surface forms of those parses.
* @param parseList Morphological parse list.
* @return All distinct surface forms for a given set of morphological parses.
*/
private HashSet<String> distinctSurfaceFormList(ArrayList<FsmParse> parseList){
HashSet<String> items = new HashSet<>();
for (FsmParse parse : parseList){
Expand All @@ -813,6 +833,14 @@ private HashSet<String> distinctSurfaceFormList(ArrayList<FsmParse> parseList){
return items;
}

/**
* This method generates all possible surface forms that can be generated by the morphological analyzer with the
* current root forms from the dictionary. Since the number of all possible surface forms is infinite in Turkish,
* this method bounds the length of the possible surface forms. It includes only those surface forms, that can be
* obtained by adding at most 4 characters to the root form. The method prints all distinct surface forms to an
* output file.
* @param outputFile Output file that will contain distinct possible surface forms.
*/
public void outputAllParses(String outputFile){
try {
PrintWriter pw = new PrintWriter(outputFile);
Expand Down Expand Up @@ -1074,6 +1102,15 @@ private ArrayList<FsmParse> analysis(String surfaceForm, boolean isProper) {
return parseWord(initialFsmParse, surfaceForm);
}

/**
* This method uses a cache to speed up pattern matching in Fsm. mostUsedPatterns stores the compiled forms of
* the previously used patterns. When Fsm tries to match a string to a pattern, first we check if it exists in
* mostUsedPatterns. If it exists, we directly use the compiled pattern to match the string. Otherwise, new pattern
* is compiled and put in the mostUsedPatterns.
* @param expr Pattern to check
* @param value String to match the pattern
* @return True if the string matches the pattern, false otherwise.
*/
private boolean patternMatches(String expr, String value){
Pattern p = mostUsedPatterns.get(expr);
if (p == null){
Expand Down Expand Up @@ -1101,7 +1138,7 @@ public boolean isProperNoun(String surfaceForm) {
* The isCode method takes surfaceForm String as input and checks if it consists of both letters and numbers
*
* @param surfaceForm String to check for code-like word.
* @return true if it is a code-like word, return false otherwise.
* @return True if it is a code-like word, return false otherwise.
*/
private boolean isCode(String surfaceForm) {
if (surfaceForm == null || surfaceForm.isEmpty()) {
Expand All @@ -1110,6 +1147,19 @@ private boolean isCode(String surfaceForm) {
return patternMatches(".*[0-9].*", surfaceForm) && patternMatches(".*[a-zA-ZçöğüşıÇÖĞÜŞİ].*", surfaceForm);
}

/**
* Identifies a possible new root word for a given surface form. It also adds the new root form to the dictionary
* for further usage. The method first searches the suffix trie for the reverse string of the surface form. This
* way, it can identify if the word has a suffix that is in the most frequently used suffix list. Since a word can
* have multiple possible suffixes, the method identifies the longest suffix and returns the substring of the
* surface form that does not contain the suffix. Let's say the word is 'googlelaştırdık', it will identify 'dık' as
* a suffix and will return 'googlelaştır' as a possible root form. Another example will be 'homelesslerimizle', it
* will identify 'lerimizle' as suffix and will return 'homeless' as a possible root form. If the root word ends
* with 'ğ', it is replaced with 'k'. 'morfolojikliğini' will return 'morfolojikliğ' then which will be replaced
* with 'morfolojiklik'.
* @param surfaceForm Surface form for which we will identify a possible new root form.
* @return Possible new root form.
*/
private TxtWord rootOfPossiblyNewWord(String surfaceForm){
HashSet<Word> words = suffixTrie.getWordsWithPrefix(reverseString(surfaceForm));
int maxLength = 0;
Expand Down Expand Up @@ -1282,18 +1332,39 @@ private boolean isNumber(String surfaceForm) {
return word.isEmpty() && count > 1;
}

/**
 * Tests whether a surface form looks like a percentage: a '%' sign followed by one or two digits, optionally
 * followed by a '.' and one or more digits (e.g. %4, %45, %4.3, %56.786). The integer pattern is tried first and
 * the decimal pattern is only consulted when the first one fails.
 * @param surfaceForm Surface form to be checked.
 * @return True if the surface form matches either percent pattern, false otherwise.
 */
private boolean isPercent(String surfaceForm){
    // Whole-number percent such as %4 or %45.
    if (patternMatches("%(\\d\\d|\\d)", surfaceForm)){
        return true;
    }
    // Percent with a fractional part such as %4.3 or %56.786.
    return patternMatches("%(\\d\\d|\\d)\\.\\d+", surfaceForm);
}

/**
 * Tests whether a surface form looks like a clock time. Two shapes are accepted, each field being one or two
 * digits: three fields separated by ':' (e.g. 3:34:12) or two fields separated by ':' (e.g. 3:34, 12:56). The
 * three-field pattern is tried first.
 * @param surfaceForm Surface form to be checked.
 * @return True if the surface form matches either time pattern, false otherwise.
 */
private boolean isTime(String surfaceForm) {
    // Three colon-separated fields.
    if (patternMatches("(\\d\\d|\\d):(\\d\\d|\\d):(\\d\\d|\\d)", surfaceForm)) {
        return true;
    }
    // Two colon-separated fields.
    return patternMatches("(\\d\\d|\\d):(\\d\\d|\\d)", surfaceForm);
}

/**
 * Tests whether a surface form looks like a range, i.e. two values joined by '-'. Three shapes are accepted and
 * tried in this order: integer-integer (e.g. 123-1400), time-time where every field is one or two digits
 * (e.g. 12:34-15:78), and decimal-decimal where both sides of each '.' are one or two digits (e.g. 3.45-4.67).
 * @param surfaceForm Surface form to be checked.
 * @return True if the surface form matches any of the range patterns, false otherwise.
 */
private boolean isRange(String surfaceForm) {
    // Order matters only for which patterns get evaluated; the first hit ends the search.
    String[] rangeExpressions = {
        "\\d+-\\d+",
        "(\\d\\d|\\d):(\\d\\d|\\d)-(\\d\\d|\\d):(\\d\\d|\\d)",
        "(\\d\\d|\\d)\\.(\\d\\d|\\d)-(\\d\\d|\\d)\\.(\\d\\d|\\d)"
    };
    for (String expression : rangeExpressions) {
        if (patternMatches(expression, surfaceForm)) {
            return true;
        }
    }
    return false;
}

/**
 * Tests whether a surface form looks like a date: two fields of one or two digits followed by a field of one or
 * more digits, all separated either by '/' (e.g. 3/10/2023) or by '.' (e.g. 2.3.2012). The slash-separated
 * pattern is tried first.
 * @param surfaceForm Surface form to be checked.
 * @return True if the surface form matches either date pattern, false otherwise.
 */
private boolean isDate(String surfaceForm) {
    // Slash-separated form.
    if (patternMatches("(\\d\\d|\\d)/(\\d\\d|\\d)/\\d+", surfaceForm)) {
        return true;
    }
    // Dot-separated form.
    return patternMatches("(\\d\\d|\\d)\\.(\\d\\d|\\d)\\.\\d+", surfaceForm);
}
Expand Down
105 changes: 101 additions & 4 deletions src/main/java/MorphologicalAnalysis/MorphologicalParse.java
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,12 @@ public String getTreePos(){
return "-XXX-";
}

/**
* Returns the pronoun type of the parse for universal dependency feature ProType.
* @return "Art" if the pronoun is also a determiner; "Prs" if the pronoun is personal pronoun; "Rcp" if the
* pronoun is 'birbiri'; "Ind" if the pronoun is an indeterminate pronoun; "Neg" if the pronoun is 'hiçbiri';
* "Int" if the pronoun is a question pronoun; "Dem" if the pronoun is a demonstrative pronoun.
*/
private String getPronType(){
String lemma = root.getName();
if (containsTag(MorphologicalTag.DETERMINER)){
Expand Down Expand Up @@ -604,6 +610,11 @@ private String getPronType(){
return null;
}

/**
* Returns the numeral type of the parse for universal dependency feature NumType.
* @return "Ord" if the parse is Time, Ordinal or the word is '%' or 'kaçıncı'; "Dist" if the word is a
* distributive number such as 'beşer'; "Card" if the number is cardinal or any number or the word is 'kaç'.
*/
private String getNumType(){
String lemma = root.getName();
if (lemma.equals("%") || containsTag(MorphologicalTag.TIME)){
Expand All @@ -621,6 +632,10 @@ private String getNumType(){
return null;
}

/**
* Returns the value for the dependency feature Reflex.
* @return "Yes" if the root word is 'kendi', null otherwise.
*/
private String getReflex(){
String lemma = root.getName();
if (lemma.equals("kendi")){
Expand All @@ -629,6 +644,11 @@ private String getReflex(){
return null;
}

/**
* Returns the agreement of the parse for the universal dependency feature Number.
* @return "Sing" if the agreement of the parse is singular (contains A1SG, A2SG, A3SG); "Plur" if the agreement
* of the parse is plural (contains A1PL, A2PL, A3PL).
*/
private String getNumber(){
if (containsTag(MorphologicalTag.A1SG) || containsTag(MorphologicalTag.A2SG) || containsTag(MorphologicalTag.A3SG)){
return "Sing";
Expand All @@ -639,6 +659,11 @@ private String getNumber(){
return null;
}

/**
* Returns the possessive agreement of the parse for the universal dependency feature [Pos].
* @return "Sing" if the possessive agreement of the parse is singular (contains P1SG, P2SG, P3SG); "Plur" if the
* possessive agreement of the parse is plural (contains P1PL, P2PL, P3PL).
*/
private String getPossessiveNumber(){
if (containsTag(MorphologicalTag.P1SG) || containsTag(MorphologicalTag.P2SG) || containsTag(MorphologicalTag.P3SG)){
return "Sing";
Expand All @@ -649,6 +674,11 @@ private String getPossessiveNumber(){
return null;
}

/**
* Returns the case marking of the parse for the universal dependency feature case.
* @return "Acc" for accusative marker; "Dat" for dative marker; "Gen" for genitive marker; "Loc" for locative
* marker; "Ins" for instrumental marker; "Abl" for ablative marker; "Nom" for nominative marker.
*/
private String getCase(){
if (containsTag(MorphologicalTag.ACCUSATIVE) || containsTag(MorphologicalTag.PCACCUSATIVE)){
return "Acc";
Expand All @@ -674,6 +704,11 @@ private String getCase(){
return null;
}

/**
* Returns the definiteness of the parse for the universal dependency feature definite. It applies only for
* determiners in Turkish.
* @return "Ind" for 'bir', 'bazı', or 'birkaç'. "Def" for 'her', 'bu', 'şu', 'o', 'bütün'.
*/
private String getDefinite(){
String lemma = root.getName();
if (containsTag(MorphologicalTag.DETERMINER)){
Expand All @@ -687,6 +722,10 @@ private String getDefinite(){
return null;
}

/**
* Returns the degree of the parse for the universal dependency feature degree.
* @return "Cmp" for comparative adverb 'daha'; "Sup" for superlative adjective or adverb 'en'.
*/
private String getDegree(){
String lemma = root.getName();
if (lemma.equals("daha")){
Expand All @@ -698,6 +737,10 @@ private String getDegree(){
return null;
}

/**
* Returns the polarity of the verb for the universal dependency feature polarity.
* @return "Pos" for positive polarity containing tag POS; "Neg" for negative polarity containing tag NEG.
*/
private String getPolarity(){
if (containsTag(MorphologicalTag.POSITIVE)){
return "Pos";
Expand All @@ -708,6 +751,10 @@ private String getPolarity(){
return null;
}

/**
* Returns the person of the agreement of the parse for the universal dependency feature person.
* @return "1" for first person; "2" for second person; "3" for third person.
*/
private String getPerson(){
if (containsTag(MorphologicalTag.A1SG) || containsTag(MorphologicalTag.A1PL)){
return "1";
Expand All @@ -721,6 +768,10 @@ private String getPerson(){
return null;
}

/**
* Returns the person of the possessive agreement of the parse for the universal dependency feature [pos].
* @return "1" for first person; "2" for second person; "3" for third person.
*/
private String getPossessivePerson(){
if (containsTag(MorphologicalTag.P1SG) || containsTag(MorphologicalTag.P1PL)){
return "1";
Expand All @@ -734,6 +785,12 @@ private String getPossessivePerson(){
return null;
}

/**
* Returns the voice of the verb parse for the universal dependency feature voice.
* @return "CauPass" if the verb parse is both causative and passive; "Pass" if the verb parse is only passive;
* "Rcp" if the verb parse is reciprocal; "Cau" if the verb parse is only causative; "Rfl" if the verb parse is
* reflexive.
*/
private String getVoice(){
if (containsTag(MorphologicalTag.CAUSATIVE) && containsTag(MorphologicalTag.PASSIVE)){
return "CauPass";
Expand All @@ -753,6 +810,11 @@ private String getVoice(){
return null;
}

/**
* Returns the aspect of the verb parse for the universal dependency feature aspect.
* @return "Perf" for past, narrative and future tenses; "Prog" for progressive tenses; "Hab" for Aorist; "Rapid"
* for parses containing HASTILY tag; "Dur" for parses containing START, STAY or REPEAT tags.
*/
private String getAspect(){
if (containsTag(MorphologicalTag.PASTTENSE) || containsTag(MorphologicalTag.NARRATIVE) || containsTag(MorphologicalTag.FUTURE)){
return "Perf";
Expand All @@ -772,6 +834,11 @@ private String getAspect(){
return null;
}

/**
* Returns the tense of the verb parse for universal dependency feature tense.
* @return "Past" for simple past tense; "Fut" for future tense; "Pqp" for narrative past tense; "Pres" for other
* past tenses.
*/
private String getTense(){
if (containsTag(MorphologicalTag.PASTTENSE)){
return "Past";
Expand All @@ -788,22 +855,35 @@ private String getTense(){
return null;
}

/**
* Returns the modality of the verb parse for the universal dependency feature mood.
* @return "GenNecPot" if both necessitative and potential is combined with a suffix of general modality;
* "CndGenPot" if both conditional and potential is combined with a suffix of general modality;
* "GenNec" if necessitative is combined with a suffix of general modality;
* "GenPot" if potential is combined with a suffix of general modality;
* "NecPot" if necessitative is combined with potential;
* "DesPot" if desiderative is combined with potential;
* "CndPot" if conditional is combined with potential;
* "CndGen" if conditional is combined with a suffix of general modality;
* "Imp" for imperative; "Cnd" for simple conditional; "Des" for simple desiderative; "Opt" for optative; "Nec" for
* simple necessitative; "Pot" for simple potential; "Gen" for simple suffix of a general modality.
*/
private String getMood(){
if ((containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.NECESSITY) && containsTag(MorphologicalTag.ABLE)){
return "GenNecPot";
}
if (containsTag(MorphologicalTag.CONDITIONAL) && (containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.ABLE)){
if ((containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.CONDITIONAL) && containsTag(MorphologicalTag.ABLE)){
return "CndGenPot";
}
if ((containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.NECESSITY)){
return "GenNec";
}
if (containsTag(MorphologicalTag.NECESSITY) && containsTag(MorphologicalTag.ABLE)){
return "NecPot";
}
if ((containsTag(MorphologicalTag.COPULA) || containsTag(MorphologicalTag.AORIST)) && containsTag(MorphologicalTag.ABLE)){
return "GenPot";
}
if (containsTag(MorphologicalTag.NECESSITY) && containsTag(MorphologicalTag.ABLE)){
return "NecPot";
}
if (containsTag(MorphologicalTag.DESIRE) && containsTag(MorphologicalTag.ABLE)){
return "DesPot";
}
Expand Down Expand Up @@ -843,6 +923,11 @@ private String getMood(){
return null;
}

/**
* Returns the form of the verb parse for the universal dependency feature verbForm.
* @return "Part" for participles; "Vnoun" for infinitives; "Conv" for parses containing tags SINCEDOINGSO,
* WITHOUTHAVINGDONESO, WITHOUTBEINGABLETOHAVEDONESO, BYDOINGSO, AFTERDOINGSO, INFINITIVE3; "Fin" for others.
*/
private String getVerbForm(){
if (containsTag(MorphologicalTag.PASTPARTICIPLE) || containsTag(MorphologicalTag.FUTUREPARTICIPLE) || containsTag(MorphologicalTag.PRESENTPARTICIPLE)){
return "Part";
Expand All @@ -862,6 +947,12 @@ private String getVerbForm(){
return null;
}

/**
* Construct the universal dependency features as an array of strings. Each element represents a single feature.
* Every feature is given as featureType = featureValue.
* @param uPos Universal dependency part of speech tag for the parse.
* @return An array of universal dependency features for this parse.
*/
public ArrayList<String> getUniversalDependencyFeatures(String uPos){
ArrayList<String> featureList = new ArrayList<>();
String pronType = getPronType();
Expand Down Expand Up @@ -940,6 +1031,12 @@ public ArrayList<String> getUniversalDependencyFeatures(String uPos){
return featureList;
}

/**
* Returns the universal dependency part of speech for this parse.
* @return "AUX" for word 'değil'; "PROPN" for proper nouns; "NOUN" for nouns; "ADJ" for adjectives; "ADV" for
* adverbs; "INTJ" for interjections; "VERB" for verbs; "PUNCT" for punctuation symbols; "DET" for determiners;
* "NUM" for numerals; "PRON" for pronouns; "ADP" for postpositions; "SCONJ" or "CCONJ" for conjunctions.
*/
public String getUniversalDependencyPos(){
String lemma = root.getName();
if (lemma.equals("değil")){
Expand Down
Loading

0 comments on commit 083d055

Please sign in to comment.