From b61e9b203de85ebb622980aea621280318b5c46e Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Mon, 4 Nov 2024 14:26:37 +0800 Subject: [PATCH] Settings: Add Settings - Measures - Effect Size - Mutual Information / Pointwise Mutual Information / Pointwise Mutual Information (Cubic) / Pointwise Mutual Information (Squared) --- CHANGELOG.md | 1 + doc/doc.md | 54 ++++----- doc/measures/effect_size/im2.svg | 63 +++++----- doc/measures/effect_size/im3.svg | 66 ++++++----- doc/measures/effect_size/mi.svg | 86 +++++++------- doc/measures/effect_size/pmi.svg | 64 +++++----- wordless/wl_measures/wl_measure_utils.py | 6 + .../wl_measures/wl_measures_effect_size.py | 48 ++++++-- wordless/wl_settings/wl_settings_default.py | 16 +++ wordless/wl_settings/wl_settings_measures.py | 110 +++++++++++++++++- 10 files changed, 352 insertions(+), 162 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c57e3373..ea6e06700 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ ## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024 ### 🎉 New Features - Measures: Add effect size - conditional probability / ΔP / squared association ratio +- Settings: Add Settings - Measures - Effect Size - Mutual Information / Pointwise Mutual Information / Pointwise Mutual Information (Cubic) / Pointwise Mutual Information (Squared) - Utils: Add Stanza's Sindhi dependency parser ### 📌 Bugfixes diff --git a/doc/doc.md b/doc/doc.md index 04127edfa..19810670c 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -1139,7 +1139,7 @@ Readability Formula|Formula|Supported Languages Al-Heeti's readability formula¹
([Al-Heeti, 1984, pp. 102, 104, 106](#ref-al-heeti-1984))|![Formula](/doc/measures/readability/rd.svg)|**Arabic** Automated Arabic Readability Index
([Al-Tamimi et al., 2013](#ref-al-tamimi-et-al-2013))|![Formula](/doc/measures/readability/aari.svg)|**Arabic** Automated Readability Index¹
([Smith & Senter, 1967, p. 8](#ref-smith-senter-1967)
Navy: [Kincaid et al., 1975, p. 14](#ref-kincaid-et-al-1975))|![Formula](/doc/measures/readability/ari.svg)|All languages -Bormuth's cloze mean & grade placement
([Bormuth, 1969, pp. 152, 160](#ref-bormuth-1969))|![Formula](/doc/measures/readability/bormuths_cloze_mean_gp.svg)
where **C** is the cloze criterion score, whose value could be changed via **Menu Bar → Preferences → Settings → Measures → Readability → Bormuth's Grade Placement → Cloze criterion score**|**English** +Bormuth's cloze mean & grade placement
([Bormuth, 1969, pp. 152, 160](#ref-bormuth-1969))|![Formula](/doc/measures/readability/bormuths_cloze_mean_gp.svg)
where **C** is the cloze criterion score, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Readability → Bormuth's Grade Placement → Cloze criterion score**|**English** Coleman-Liau index
([Coleman & Liau, 1975](#ref-coleman-liau-1975))|![Formula](/doc/measures/readability/coleman_liau_index.svg)|All languages Coleman's readability formula¹
([Liau et al., 1976](#ref-liau-et-al-1976))|![Formula](/doc/measures/readability/colemans_readability_formula.svg)|All languages²³ Crawford's readability formula
([Crawford, 1985](#ref-crawford-1985))|![Formula](/doc/measures/readability/crawfords_readability_formula.svg)|**Spanish**² @@ -1153,25 +1153,25 @@ Readability Formula|Formula|Supported Languages Flesch-Kincaid grade level
([Kincaid et al., 1975, p. 14](#ref-kincaid-et-al-1975))|![Formula](/doc/measures/readability/gl.svg)|All languages² Flesch reading ease¹
([Flesch, 1948](#ref-flesch-1948)
Powers-Sumner-Kearl: [Powers et al., 1958](#ref-powers-et-al-1958)
Dutch: [Douma, 1960, p. 453](#ref-douma-1960); [Brouwer, 1963](#ref-brouwer-1963)
French: [Kandel & Moles, 1958](#ref-kandel-moles-1958)
German: [Amstad, 1978](#ref-amstad-1978)
Italian: [Franchina & Vacca, 1986](#ref-franchina-vacca-1986)
Russian: [Oborneva, 2006, p. 13](#ref-oborneva-2006)
Spanish: [Fernández Huerta, 1959](#ref-fernandez-huerta-1959); [Szigriszt Pazos, 1993, p. 247](#ref-szigrisze-pazos-1993)
Ukrainian: [Partiko, 2001](#ref-partiko-2001))|![Formula](/doc/measures/readability/re.svg)|All languages² Flesch reading ease (Farr-Jenkins-Paterson)¹
([Farr et al., 1951](#ref-farr-et-al-1951)
Powers-Sumner-Kearl: [Powers et al., 1958](#ref-powers-et-al-1958))|![Formula](/doc/measures/readability/re_farr_jenkins_paterson.svg)|All languages² -FORCAST
([Caylor & Sticht, 1973, p. 3](#ref-caylor-sticht-1973))|![Formula](/doc/measures/readability/rgl.svg)

* **One sample of 150 words** would be taken randomly from the text, so the text should be **at least 150 words long**.|All languages² +FORCAST
([Caylor & Sticht, 1973, p. 3](#ref-caylor-sticht-1973))|![Formula](/doc/measures/readability/rgl.svg)

* **A 150-word-long sample** would be taken randomly from the text, so the text should be **at least 150 words long**.|All languages² Fucks's Stilcharakteristik
([Fucks, 1955](#ref-fucks-1955))|![Formula](/doc/measures/readability/fuckss_stilcharakteristik.svg)|All languages² GULPEASE
([Lucisano & Emanuela Piemontese, 1988](#ref-lucisano-emanuela-piemontese-1988))|![Formula](/doc/measures/readability/gulpease.svg)|**Italian** Gunning Fog Index¹
(English: [Gunning, 1968, p. 38](#ref-gunning-1968)
Powers-Sumner-Kearl: [Powers et al., 1958](#ref-powers-et-al-1958)
Navy: [Kincaid et al., 1975, p. 14](#ref-kincaid-et-al-1975)
Polish: [Pisarek, 1969](#ref-pisarek-1969))|![Formula](/doc/measures/readability/fog_index.svg)
where **NumHardWords** is the number of words with 3 or more syllables, except proper nouns and words with 3 syllables ending with *-ed* or *-es*, for **English texts**, and the number of words with 4 or more syllables in their base forms, except proper nouns, for **Polish texts**.|**English & Polish**² Gutiérrez de Polini's readability formula
([Gutiérrez de Polini, 1972](#ref-gutierrez-de-polini-1972))|![Formula](/doc/measures/readability/cp.svg)|**Spanish** Legibilidad µ
([Muñoz Baquedano, 2006](#ref-munoz-baquedano-2006))|![Formula](/doc/measures/readability/mu.svg)
where **LenWordsAvg** is the average word length in letters, and **LenWordsVar** is the variance of word lengths in letters.|**Spanish** -Lensear Write Formula
([O’Hayre, 1966, p. 8](#ref-o-hayre-1966))|![Formula](/doc/measures/readability/lensear_write_formula.svg)
where **NumWords1Syl** is the number of monosyllabic words excluding *the*, *is*, *are*, *was*, *were*.

* **One sample of 100 words** would be taken randomly from the text, and if the text is **shorter than 100 words**, **NumWords1Syl** and **NumSentences** would be multiplied by 100 and then divided by **NumWords**.|**English**² +Lensear Write Formula
([O’Hayre, 1966, p. 8](#ref-o-hayre-1966))|![Formula](/doc/measures/readability/lensear_write_formula.svg)
where **NumWords1Syl** is the number of monosyllabic words excluding *the*, *is*, *are*, *was*, *were*.

* **A 100-word-long sample** would be taken randomly from the text. If the text is **shorter than 100 words**, **NumWords1Syl** and **NumSentences** would be multiplied by 100 and then divided by **NumWords**.|**English**² Lix
([Björnsson, 1968](#ref-bjornsson-1968))|![Formula](/doc/measures/readability/lix.svg)|All languages Lorge Readability Index¹
([Lorge, 1944](#ref-lorge-1944)
Corrected: [Lorge, 1948](#ref-lorge-1948))|![Formula](/doc/measures/readability/lorge_readability_index.svg)|**English**³ -Luong-Nguyen-Dinh's readability formula
([Luong et al., 2018](#ref-luong-et-al-2018))|![Formula](/doc/measures/readability/luong_nguyen_dinhs_readability_formula.svg)

* The number of syllables is estimated by tokenizing the text by whitespace and counting the number of tokens excluding punctuation marks|**Vietnamese** +Luong-Nguyen-Dinh's readability formula
([Luong et al., 2018](#ref-luong-et-al-2018))|![Formula](/doc/measures/readability/luong_nguyen_dinhs_readability_formula.svg)

* The number of syllables is estimated by tokenizing the text by whitespace and counting the number of tokens excluding punctuation marks.|**Vietnamese** McAlpine EFLAW Readability Score
([McAlpine, 2006](#ref-mcalpine-2006))|![Formula](/doc/measures/readability/eflaw.svg)|**English** neue Wiener Literaturformeln¹
([Bamberger & Vanecek, 1984, p. 82](#ref-bamberger-vanecek-1984))|![Formula](/doc/measures/readability/nwl.svg)|**German**² neue Wiener Sachtextformel¹
([Bamberger & Vanecek, 1984, pp. 83–84](#ref-bamberger-vanecek-1984))|![Formula](/doc/measures/readability/nws.svg)|**German**² OSMAN
([El-Haj & Rayson, 2016](#ref-elhaj-rayson-2016))|![Formula](/doc/measures/readability/osman.svg)
where **NumFaseehWords** is the number of words which have 5 or more syllables and contain ء/ئ/ؤ/ذ/ظ or end with وا/ون.

* The number of syllables in each word is estimated by adding up the number of short syllables and twice the number of long and stress syllables in each word.|**Arabic** Rix
([Anderson, 1983](#ref-anderson-1983))|![Formula](/doc/measures/readability/rix.svg)|All languages -SMOG Grading
([McLaughlin, 1969](#ref-mclaughlin-1969)
German: [Bamberger & Vanecek, 1984, p.78](#ref-bamberger-vanecek-1984))|![Formula](/doc/measures/readability/smog_grading.svg)

* A sample would be constructed using **the first 10 sentences, the last 10 sentences, and the 10 sentences at the middle of the text**, so the text should be **at least 30 sentences long**.|All languages² -Spache readability formula¹
([Spache, 1953](#ref-spache-1953)
Revised: [Spache, 1974](#ref-spache-1974))|![Formula](/doc/measures/readability/spache_readability_formula.svg)

* **Three samples each of 100 words** would be taken randomly from the text and the results would be averaged out, so the text should be **at least 100 words long**.|English -Strain Index
([Nathaniel, 2017](#ref-nathaniel-2017))|![Formula](/doc/measures/readability/strain_index.svg)

* A sample would be constructed using **the first 3 sentences in the text**, so the text should be **at least 3 sentences long**.|All languages² -Tränkle-Bailer's readability formula¹
([Tränkle & Bailer, 1984](#ref-trankle-bailer-1984))|![Formula](/doc/measures/readability/trankle_bailers_readability_formula.svg)

* **One sample of 100 words** would be taken randomly from the text, so the text should be **at least 100 words long**.|All languages³ +SMOG Grading
([McLaughlin, 1969](#ref-mclaughlin-1969)
German: [Bamberger & Vanecek, 1984, p. 78](#ref-bamberger-vanecek-1984))|![Formula](/doc/measures/readability/smog_grading.svg)

* A sample consisting of **the first 10 sentences, the last 10 sentences, and the 10 sentences in the middle of the text** would be taken from the text, so the text should be **at least 30 sentences long**.|All languages² +Spache readability formula¹
([Spache, 1953](#ref-spache-1953)
Revised: [Spache, 1974](#ref-spache-1974))|![Formula](/doc/measures/readability/spache_readability_formula.svg)

* **Three 100-word-long samples** would be taken randomly from the text and the results would be averaged out, so the text should be **at least 100 words long**.|English +Strain Index
([Nathaniel, 2017](#ref-nathaniel-2017))|![Formula](/doc/measures/readability/strain_index.svg)

* A sample consisting of **the first 3 sentences of the text** would be taken from the text, so the text should be **at least 3 sentences long**.|All languages² +Tränkle-Bailer's readability formula¹
([Tränkle & Bailer, 1984](#ref-trankle-bailer-1984))|![Formula](/doc/measures/readability/trankle_bailers_readability_formula.svg)

* **A 100-word-long sample** would be taken randomly from the text, so the text should be **at least 100 words long**.|All languages³ Tuldava's readability formula
([Tuldava, 1975](#ref-tuldava-1975))|![Formula](/doc/measures/readability/td.svg)|All languages² Wheeler-Smith's readability formula
([Wheeler & Smith, 1954](#ref-wheeler-smith-1954))|![Formula](/doc/measures/readability/wheeler_smiths_readability_formula.svg)
where **NumUnits** is the number of sentence segments ending in periods, question marks, exclamation marks, colons, semicolons, and dashes.|All languages² @@ -1269,14 +1269,14 @@ Indicator of Lexical Density/Diversity|Formula --------------------------------------|------- Brunét's index
([Brunét, 1978](#ref-brunet-1978))|![Formula](/doc/measures/lexical_density_diversity/brunets_index.svg) Corrected TTR
([Carroll, 1964](#ref-carroll-1964))|![Formula](/doc/measures/lexical_density_diversity/cttr.svg) -Fisher's Index of Diversity
([Fisher et al., 1943](#ref-fisher-et-al-1943))|![Formula](/doc/measures/lexical_density_diversity/fishers_index_of_diversity.svg)
where *W*₋₁ is the -1 branch of the [Lambert W function](https://en.wikipedia.org/wiki/Lambert_W_function) +Fisher's Index of Diversity
([Fisher et al., 1943](#ref-fisher-et-al-1943))|![Formula](/doc/measures/lexical_density_diversity/fishers_index_of_diversity.svg)
where *W*₋₁ is the -1 branch of the [Lambert W function](https://en.wikipedia.org/wiki/Lambert_W_function). Herdan's vₘ
([Herdan, 1955](#ref-herdan-1955))|![Formula](/doc/measures/lexical_density_diversity/herdans_vm.svg) -HD-D
([McCarthy & Jarvis, 2010](#ref-mccarthy-jarvis-2010))|For detailed calculation procedures, see reference.
The sample size could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → HD-D → Sample size**. +HD-D
([McCarthy & Jarvis, 2010](#ref-mccarthy-jarvis-2010))|For detailed calculation procedures, see reference.

The sample size could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → HD-D → Sample size**. Honoré's statistic
([Honoré, 1979](#ref-honore-1979))|![Formula](/doc/measures/lexical_density_diversity/honores_stat.svg) -Lexical density
([Halliday, 1989, p. 64](#ref-halliday-1989))|![Formula](/doc/measures/lexical_density_diversity/lexical_density.svg)
where **NumContentWords** is the number of content words. By default, all tokens whose universal part-of-speech tags assigned by built-in part-of-speech taggers are ADJ (adjectives), ADV (adverbs), INTJ (interjections), NOUN (nouns), PROPN (proper nouns), NUM (numerals), VERB (verbs), SYM (symbols), or X (others) are categorized as content words. For some built-in part-of-speech taggers, this behavior could be changed via **Menu Bar → Preferences → Settings → Part-of-speech Tagging → Tagsets → Mapping Settings → Content/Function Words**. +Lexical density
([Halliday, 1989, p. 64](#ref-halliday-1989))|![Formula](/doc/measures/lexical_density_diversity/lexical_density.svg)
where **NumContentWords** is the number of content words. By default, all tokens whose universal part-of-speech tags assigned by built-in part-of-speech taggers are ADJ (adjectives), ADV (adverbs), INTJ (interjections), NOUN (nouns), PROPN (proper nouns), NUM (numerals), VERB (verbs), SYM (symbols), or X (others) are categorized as content words. For some built-in part-of-speech taggers, this behavior could be modified via **Menu Bar → Preferences → Settings → Part-of-speech Tagging → Tagsets → Mapping Settings → Content/Function Words**. LogTTR¹
(Herdan: [Herdan, 1960, p. 28](#ref-herdan-1960)
Somers: [Somers, 1966](#ref-somers-1966)
Rubet: [Dugast, 1979](#ref-dugast-1979)
Maas: [Maas, 1972](#ref-maas-1972)
Dugast: [Dugast, 1978](#ref-dugast-1978); [Dugast, 1979](#ref-dugast-1979))|![Formula](/doc/measures/lexical_density_diversity/logttr.svg) Mean segmental TTR
([Johnson, 1944](#ref-johnson-1944))|![Formula](/doc/measures/lexical_density_diversity/msttr.svg)
where **n** is the number of equal-sized segment, the length of which could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → Mean Segmental TTR → Number of tokens in each segment**, **NumTypesSegᵢ** is the number of token types in the **i**-th segment, and **NumTokensSegᵢ** is the number of tokens in the **i**-th segment. -Measure of textual lexical diversity
([McCarthy, 2005, pp. 95–96, 99–100](#ref-mccarthy-2005); [McCarthy & Jarvis, 2010](#ref-mccarthy-jarvis-2010))|For detailed calculation procedures, see references.
The factor size could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → Measure of Textual Lexical Diversity → Factor size**. +Measure of textual lexical diversity
([McCarthy, 2005, pp. 95–96, 99–100](#ref-mccarthy-2005); [McCarthy & Jarvis, 2010](#ref-mccarthy-jarvis-2010))|For detailed calculation procedures, see references.

The factor size could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → Measure of Textual Lexical Diversity → Factor size**. Moving-average TTR
([Covington & McFall, 2010](#ref-covington-mcfall-2010))|![Formula](/doc/measures/lexical_density_diversity/mattr.svg)
where **w** is the window size which could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → Moving-average TTR → Window size**, **NumTypesWindowₚ** is the number of token types within the moving window starting at position **p**, and **NumTokensWindowₚ** is the number of tokens within the moving window starting at position **p**. Popescu-Mačutek-Altmann's B₁/B₂/B₃/B₄/B₅
([Popescu et al., 2008](#ref-popescu-et-al-2008))|![Formula](/doc/measures/lexical_density_diversity/popescu_macutek_altmanns_b1_b2_b3_b4_b5.svg) Popescu's R₁
([Popescu, 2009, pp. 18, 30, 33](#ref-popescu-2009))|For detailed calculation procedures, see reference. @@ -1367,9 +1367,9 @@ Measure of Dispersion (Parts-based)|Measure of Adjusted Frequency (Parts-based)| -----------------------------------|-------------------------------------------|------- Carroll's D₂
([Carroll, 1970](#ref-carroll-1970))|Carroll's Uₘ
([Carroll, 1970](#ref-carroll-1970))|![Formula](/doc/measures/dispersion_adjusted_frequency/carrolls_um.svg)  |Engwall's FM
([Engwall, 1974, p. 53](#ref-engwall-1974))|![Formula](/doc/measures/dispersion_adjusted_frequency/engwalls_fm.svg)
where **R** is the number of sub-sections in which the word appears at least once. -Gries's DP
([Gries, 2008](#ref-gries-2008); [Lijffijt & Gries, 2012](#ref-lijffijt-gries-2012))||![Formula](/doc/measures/dispersion_adjusted_frequency/griess_dp.svg)

* Normalization is applied by default, which behavior you could change via **Menu Bar → Preferences → Settings → Measures → Dispersion → Gries's DP → Apply normalization**. +Gries's DP
([Gries, 2008](#ref-gries-2008); [Lijffijt & Gries, 2012](#ref-lijffijt-gries-2012))||![Formula](/doc/measures/dispersion_adjusted_frequency/griess_dp.svg)

* Normalization is applied by default, and this behavior could be modified via **Menu Bar → Preferences → Settings → Measures → Dispersion → Gries's DP → Apply normalization**. Juilland's D
([Juilland & Chang-Rodrigues, 1964, p. LIII](#ref-juilland-chang-rodrigues-1964))|Juilland's U
([Juilland & Chang-Rodrigues, 1964, p. LXVIII](#ref-juilland-chang-rodrigues-1964))|![Formula](/doc/measures/dispersion_adjusted_frequency/juillands_u.svg) - |Kromer's UR
([Kromer, 2003](#ref-kromer-2003))|![Formula](/doc/measures/dispersion_adjusted_frequency/kromers_ur.svg)
where **ψ** is the [digamma function](https://en.wikipedia.org/wiki/Digamma_function), and **C** is the [Euler–Mascheroni constant](https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant). + |Kromer's UR
([Kromer, 2003](#ref-kromer-2003))|![Formula](/doc/measures/dispersion_adjusted_frequency/kromers_ur.svg)
where **ψ** is the [digamma function](https://en.wikipedia.org/wiki/Digamma_function) and **C** is the [Euler–Mascheroni constant](https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant). Lyne's D₃
([Lyne, 1985](#ref-lyne-1985))||![Formula](/doc/measures/dispersion_adjusted_frequency/lynes_d3.svg) Rosengren's S
([Rosengren, 1971](#ref-rosengren-1971))|Rosengren's KF
([Rosengren, 1971](#ref-rosengren-1971))|![Formula](/doc/measures/dispersion_adjusted_frequency/rosengrens_s.svg) Zhang's Distributional Consistency
([Zhang, 2004](#ref-zhang-2004))||![Formula](/doc/measures/dispersion_adjusted_frequency/zhangs_distributional_consistency.svg) @@ -1474,9 +1474,6 @@ Test of Statistical Significance|Measure of Bayes Factor|Formula|Collocation Ext Conditional probability: \text{P} = \frac{O_{11}}{O_{x1}} \times 100 -Cubic association ratio: - \text{IM}^3 = \log_{2} \frac{{O_{11}}^3}{E_{11}} - ΔP: \Delta\text{P} = \frac{O_{11}}{O_{x1}} - \frac{O_{12}}{O_{x2}} @@ -1508,7 +1505,7 @@ Mutual Expectation: \text{ME} = O_{11} \times \frac{2 \times O_{11}}{O_{1x} + O_{x1}} Mutual information: - \text{MI} = \sum_{i = 1}^n \sum_{j = 1}^n \left(\frac{O_{ij}}{O_{xx}} \times \log_{2} \frac{O_{ij}}{E_{ij}}\right) + \text{MI} = \sum_{i = 1}^n \sum_{j = 1}^n \left(\frac{O_{ij}}{O_{xx}} \times \log_{base} \frac{O_{ij}}{E_{ij}}\right) Odds ratio: \text{Odds ratio} = \frac{O_{11} \times O_{22}}{O_{12} \times O_{21}} @@ -1517,14 +1514,17 @@ Odds ratio: \text{%DIFF} = \frac{\left(\frac{O_{11}}{O_{x1}} - \frac{O_{12}}{O_{x2}}\right) \times 100}{\frac{O_{12}}{O_{x2}}} Pointwise mutual information: - \text{PMI} = \log_{2} \frac{O_{11}}{E_{11}} + \text{PMI} = \log_{base} \frac{O_{11}}{E_{11}} + +Pointwise mutual information (cubic): + \text{IM}^3 = \log_{base} \frac{{O_{11}}^3}{E_{11}} + +Pointwise mutual information (squared): + \text{IM}^2 = \log_{base} \frac{{O_{11}}^2}{E_{11}} Poisson collocation measure: \text{sig} = \frac{O_{11} \times (\ln O_{11} - \ln E_{11} - 1)}{\ln O_{xx}} -Squared association ratio: - \text{IM}^2 = \log_{2} \frac{{O_{11}}^2}{E_{11}} - Squared phi coefficient: \phi^2 = \frac{(O_{11} \times O_{22} - O_{12} \times O_{21})^2}{O_{1x} \times O_{2x} \times O_{x1} \times O_{x2}} --> @@ -1536,18 +1536,18 @@ Measure of Effect Size|Formula|Collocation Extraction|Keyword Extraction Dice-Sørensen coefficient
([Smadja et al., 1996, p. 8](#ref-smadja-et-al-1996))|![Formula](/doc/measures/effect_size/dice_sorensen_coeff.svg)|✔|✖️ Difference coefficient
([Hofland & Johansson, 1982, p. 14](#ref-hofland-johansson-1982); [Gabrielatos, 2018, p. 236](#ref-gabrielatos-2018))|![Formula](/doc/measures/effect_size/diff_coeff.svg)|✖️|✔ Jaccard index
([Dunning, 1998, p. 48](#ref-dunning-1998))|![Formula](/doc/measures/effect_size/jaccard_index.svg)|✔|✖️ -Kilgarriff's ratio
([Kilgarriff, 2009](#ref-kilgarriff-2009))|![Formula](/doc/measures/effect_size/kilgarriffs_ratio.svg)
where **α** is the smoothing parameter, whose value could be changed via **Menu Bar → Preferences → Settings → Measures → Effect Size → Kilgarriff's Ratio → Smoothing Parameter**.|✖️|✔ +Kilgarriff's ratio
([Kilgarriff, 2009](#ref-kilgarriff-2009))|![Formula](/doc/measures/effect_size/kilgarriffs_ratio.svg)
where **α** is the smoothing parameter, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Kilgarriff's Ratio → Smoothing parameter**.|✖️|✔ logDice
([Rychlý, 2008, p. 9](#ref-rychly-2008))|![Formula](/doc/measures/effect_size/log_dice.svg)|✔|✖️ Log Ratio
([Hardie, 2014](#ref-hardie-2014))|![Formula](/doc/measures/effect_size/log_ratio.svg)|✔|✔ MI.log-f
([Kilgarriff & Tugwell, 2002](#ref-kilgarriff-tugwell-2002); [Lexical Computing Ltd., 2015, p. 4](#ref-lexical-computing-ltd-2015))|![Formula](/doc/measures/effect_size/mi_log_f.svg)|✔|✖️ Minimum sensitivity
([Pedersen, 1998](#ref-pedersen-1998))|![Formula](/doc/measures/effect_size/min_sensitivity.svg)|✔|✖️ Mutual Expectation
([Dias et al., 1999](#ref-dias-et-al-1999))|![Formula](/doc/measures/effect_size/me.svg)|✔|✖️ -Mutual information
([Dunning, 1998, pp. 49–52](#ref-dunning-1998))|![Formula](/doc/measures/effect_size/mi.svg)|✔|✖️ +Mutual information
([Dunning, 1998, pp. 49–52](#ref-dunning-1998))|![Formula](/doc/measures/effect_size/mi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Mutual Information → Base of logarithm**.|✔|✖️ Odds ratio
([Pecina, 2005, p. 15](#ref-pecina-2005), [Pojanapunya & Todd, 2016](#ref-pojanapunya-todd-2016))|![Formula](/doc/measures/effect_size/odds_ratio.svg)|✔|✔ %DIFF
([Gabrielatos & Marchi, 2011](#ref-gabrielatos-marchi-2011))|![Formula](/doc/measures/effect_size/pct_diff.svg)|✖️|✔ -Pointwise mutual information
([Church & Hanks, 1990](#ref-church-hanks-1990); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/pmi.svg)|✔|✔ -Pointwise mutual information (cubic)**¹**
([Daille, 1994, p. 139](#ref-daille-1994); [Kilgarriff, 2001, p, 99](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/im3.svg)|✔|✔ -Pointwise mutual information (squared)**¹**
([Daille, 1995, p. 21](#ref-daille-1995); [Kilgarriff, 2001, p, 99](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/im2.svg)|✔|✔ +Pointwise mutual information
([Church & Hanks, 1990](#ref-church-hanks-1990); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/pmi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information → Base of logarithm**.|✔|✔ +Pointwise mutual information (cubic)**¹**
([Daille, 1994, p. 139](#ref-daille-1994); [Kilgarriff, 2001, p. 99](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/im3.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information (Cubic) → Base of logarithm**.|✔|✔ +Pointwise mutual information (squared)**¹**
([Daille, 1995, p. 21](#ref-daille-1995); [Kilgarriff, 2001, p. 99](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/im2.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information (Squared) → Base of logarithm**.|✔|✔ Poisson collocation measure
([Quasthoff & Wolff, 2002](#ref-quasthoff-wolff-2002))|![Formula](/doc/measures/effect_size/poisson_collocation_measure.svg)|✔|✖️ Squared phi coefficient
([Church & Gale, 1991](#ref-church-gale-1991))|![Formula](/doc/measures/effect_size/squared_phi_coeff.svg)|✔|✖️ diff --git a/doc/measures/effect_size/im2.svg b/doc/measures/effect_size/im2.svg index 8cb48ef21..47bfa0cab 100644 --- a/doc/measures/effect_size/im2.svg +++ b/doc/measures/effect_size/im2.svg @@ -1,34 +1,41 @@ - + - - - - - - - - - - + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/effect_size/im3.svg b/doc/measures/effect_size/im3.svg index f0dd8415c..c37f332e2 100644 --- a/doc/measures/effect_size/im3.svg +++ b/doc/measures/effect_size/im3.svg @@ -1,35 +1,41 @@ - - + + - - - - - - - - - - - + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/effect_size/mi.svg b/doc/measures/effect_size/mi.svg index fc6abde94..fc709ad20 100644 --- a/doc/measures/effect_size/mi.svg +++ b/doc/measures/effect_size/mi.svg @@ -1,6 +1,6 @@ - - + + @@ -11,51 +11,57 @@ + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/effect_size/pmi.svg b/doc/measures/effect_size/pmi.svg index c8e55e9c6..3128112b7 100644 --- a/doc/measures/effect_size/pmi.svg +++ b/doc/measures/effect_size/pmi.svg @@ -1,34 +1,40 @@ - - + + - - - - - - - - - - - + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/wordless/wl_measures/wl_measure_utils.py b/wordless/wl_measures/wl_measure_utils.py index 370bbf9dd..ec0bb8dc7 100644 --- a/wordless/wl_measures/wl_measure_utils.py +++ b/wordless/wl_measures/wl_measure_utils.py @@ -134,3 +134,9 @@ def numpy_log2(a, default = 0): return numpy.log2(a, out = numpy.full_like(a, default, dtype = float), where = a > 
0) else: return numpy.log2(a, out = numpy.zeros_like(a, dtype = float), where = a > 0) + +def numpy_log10(a, default = 0): + if default: + return numpy.log10(a, out = numpy.full_like(a, default, dtype = float), where = a > 0) + else: + return numpy.log10(a, out = numpy.zeros_like(a, dtype = float), where = a > 0) diff --git a/wordless/wl_measures/wl_measures_effect_size.py b/wordless/wl_measures/wl_measures_effect_size.py index 98939dbaa..6a1a13334 100644 --- a/wordless/wl_measures/wl_measures_effect_size.py +++ b/wordless/wl_measures/wl_measures_effect_size.py @@ -18,6 +18,8 @@ # pylint: disable=unused-argument +import math + import numpy from wordless.wl_measures import wl_measures_statistical_significance, wl_measure_utils @@ -133,10 +135,18 @@ def mi(main, o11s, o12s, o21s, o22s): oxxs = o11s + o12s + o21s + o22s e11s, e12s, e21s, e22s = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s) - mi_11 = wl_measure_utils.numpy_divide(o11s, oxxs) * wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s, e11s)) - mi_12 = wl_measure_utils.numpy_divide(o12s, oxxs) * wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o12s, e12s)) - mi_21 = wl_measure_utils.numpy_divide(o21s, oxxs) * wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o21s, e21s)) - mi_22 = wl_measure_utils.numpy_divide(o22s, oxxs) * wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o22s, e22s)) + match main.settings_custom['measures']['effect_size']['mi']['base_log']: + case 2: + numpy_log = wl_measure_utils.numpy_log2 + case 10: + numpy_log = wl_measure_utils.numpy_log10 + case math.e: + numpy_log = wl_measure_utils.numpy_log + + mi_11 = wl_measure_utils.numpy_divide(o11s, oxxs) * numpy_log(wl_measure_utils.numpy_divide(o11s, e11s)) + mi_12 = wl_measure_utils.numpy_divide(o12s, oxxs) * numpy_log(wl_measure_utils.numpy_divide(o12s, e12s)) + mi_21 = wl_measure_utils.numpy_divide(o21s, oxxs) * numpy_log(wl_measure_utils.numpy_divide(o21s, e21s)) + 
mi_22 = wl_measure_utils.numpy_divide(o22s, oxxs) * numpy_log(wl_measure_utils.numpy_divide(o22s, e22s)) return mi_11 + mi_12 + mi_21 + mi_22 @@ -179,21 +189,45 @@ def pct_diff(main, o11s, o12s, o21s, o22s): def pmi(main, o11s, o12s, o21s, o22s): e11s, _, _, _ = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s) - return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s, e11s)) + match main.settings_custom['measures']['effect_size']['pmi']['base_log']: + case 2: + numpy_log = wl_measure_utils.numpy_log2 + case 10: + numpy_log = wl_measure_utils.numpy_log10 + case math.e: + numpy_log = wl_measure_utils.numpy_log + + return numpy_log(wl_measure_utils.numpy_divide(o11s, e11s)) # Pointwise mutual information (cubic) # Reference: Daille, B. (1994). Approche mixte pour l'extraction automatique de terminologie: statistiques lexicales et filtres linguistiques [Doctoral thesis, Paris Diderot University]. Béatrice Daille. http://www.bdaille.com/index.php?option=com_docman&task=doc_download&gid=8&Itemid= | p. 139 def im3(main, o11s, o12s, o21s, o22s): e11s, _, _, _ = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s) - return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s ** 3, e11s)) + match main.settings_custom['measures']['effect_size']['im3']['base_log']: + case 2: + numpy_log = wl_measure_utils.numpy_log2 + case 10: + numpy_log = wl_measure_utils.numpy_log10 + case math.e: + numpy_log = wl_measure_utils.numpy_log + + return numpy_log(wl_measure_utils.numpy_divide(o11s ** 3, e11s)) # Pointwise mutual information (squared) # Reference: Daille, B. (1995). Combined approach for terminology extraction: Lexical statistics and linguistic filtering. UCREL technical papers (Vol. 5). Lancaster University. | p. 
21 def im2(main, o11s, o12s, o21s, o22s): e11s, _, _, _ = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s) - return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s ** 2, e11s)) + match main.settings_custom['measures']['effect_size']['im2']['base_log']: + case 2: + numpy_log = wl_measure_utils.numpy_log2 + case 10: + numpy_log = wl_measure_utils.numpy_log10 + case math.e: + numpy_log = wl_measure_utils.numpy_log + + return numpy_log(wl_measure_utils.numpy_divide(o11s ** 2, e11s)) # Poisson collocation measure # Reference: Quasthoff, U., & Wolff, C. (2002). The poisson collocation measure and its applications. Proceedings of 2nd International Workshop on Computational Approaches to Collocations. IEEE. diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index 4b93e6352..2f3dcc535 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -2433,6 +2433,22 @@ def init_settings_default(main): 'effect_size': { 'kilgarriffs_ratio': { 'smoothing_param': 1.00 + }, + + 'mi': { + 'base_log': 2 + }, + + 'pmi': { + 'base_log': 2 + }, + + 'im3': { + 'base_log': 2 + }, + + 'im2': { + 'base_log': 2 } } }, diff --git a/wordless/wl_settings/wl_settings_measures.py b/wordless/wl_settings/wl_settings_measures.py index 91f00ef1f..f3a3bf914 100644 --- a/wordless/wl_settings/wl_settings_measures.py +++ b/wordless/wl_settings/wl_settings_measures.py @@ -17,12 +17,16 @@ # ---------------------------------------------------------------------- import copy +import math +from PyQt5.QtCore import QCoreApplication from PyQt5.QtWidgets import QCheckBox, QGroupBox, QLabel from wordless.wl_settings import wl_settings from wordless.wl_widgets import wl_boxes, wl_layouts, wl_widgets +_tr = QCoreApplication.translate + # Measures - Readability class Wl_Settings_Measures_Readability(wl_settings.Wl_Settings_Node): def __init__(self, main): @@ -950,6 +954,34 
@@ def apply_settings(self): return True # Measures - Effect Size +class Wl_Combo_Box_Base_Log(wl_boxes.Wl_Combo_Box): + # pylint: disable=inconsistent-return-statements + + def __init__(self, parent): + super().__init__(parent) + + self.addItems([ + '2', + '10', + _tr('wl_settings_measures', 'Base of natural logarithm') + ]) + + def get_base_log(self): + if self.currentText() == '2': + return 2 + elif self.currentText() == '10': + return 10 + elif self.currentText() == _tr('wl_settings_measures', 'Base of natural logarithm'): + return math.e + + def set_base_log(self, base_log): + if base_log == 2: + self.setCurrentText('2') + elif base_log == 10: + self.setCurrentText('10') + elif base_log == math.e: + self.setCurrentText(_tr('wl_settings_measures', 'Base of natural logarithm')) + class Wl_Settings_Measures_Effect_Size(wl_settings.Wl_Settings_Node): def __init__(self, main): super().__init__(main) @@ -971,11 +1003,63 @@ def __init__(self, main): self.group_box_kilgarriffs_ratio.layout().setColumnStretch(2, 1) + # Mutual Information + self.group_box_mi = QGroupBox(self.tr('Mutual Information'), self) + + self.label_mi_base_log = QLabel(self.tr('Base of logarithm:'), self) + self.combo_box_mi_base_log = Wl_Combo_Box_Base_Log(self) + + self.group_box_mi.setLayout(wl_layouts.Wl_Layout()) + self.group_box_mi.layout().addWidget(self.label_mi_base_log, 0, 0) + self.group_box_mi.layout().addWidget(self.combo_box_mi_base_log, 0, 1) + + self.group_box_mi.layout().setColumnStretch(2, 1) + + # Pointwise Mutual Information + self.group_box_pmi = QGroupBox(self.tr('Pointwise Mutual Information'), self) + + self.label_pmi_base_log = QLabel(self.tr('Base of logarithm:'), self) + self.combo_box_pmi_base_log = Wl_Combo_Box_Base_Log(self) + + self.group_box_pmi.setLayout(wl_layouts.Wl_Layout()) + self.group_box_pmi.layout().addWidget(self.label_pmi_base_log, 0, 0) + self.group_box_pmi.layout().addWidget(self.combo_box_pmi_base_log, 0, 1) + + 
self.group_box_pmi.layout().setColumnStretch(2, 1) + + # Pointwise Mutual Information (Cubic) + self.group_box_im3 = QGroupBox(self.tr('Pointwise Mutual Information (Cubic)'), self) + + self.label_im3_base_log = QLabel(self.tr('Base of logarithm:'), self) + self.combo_box_im3_base_log = Wl_Combo_Box_Base_Log(self) + + self.group_box_im3.setLayout(wl_layouts.Wl_Layout()) + self.group_box_im3.layout().addWidget(self.label_im3_base_log, 0, 0) + self.group_box_im3.layout().addWidget(self.combo_box_im3_base_log, 0, 1) + + self.group_box_im3.layout().setColumnStretch(2, 1) + + # Pointwise Mutual Information (Squared) + self.group_box_im2 = QGroupBox(self.tr('Pointwise Mutual Information (Squared)'), self) + + self.label_im2_base_log = QLabel(self.tr('Base of logarithm:'), self) + self.combo_box_im2_base_log = Wl_Combo_Box_Base_Log(self) + + self.group_box_im2.setLayout(wl_layouts.Wl_Layout()) + self.group_box_im2.layout().addWidget(self.label_im2_base_log, 0, 0) + self.group_box_im2.layout().addWidget(self.combo_box_im2_base_log, 0, 1) + + self.group_box_im2.layout().setColumnStretch(2, 1) + self.setLayout(wl_layouts.Wl_Layout()) self.layout().addWidget(self.group_box_kilgarriffs_ratio, 0, 0) + self.layout().addWidget(self.group_box_mi, 1, 0) + self.layout().addWidget(self.group_box_pmi, 2, 0) + self.layout().addWidget(self.group_box_im3, 3, 0) + self.layout().addWidget(self.group_box_im2, 4, 0) self.layout().setContentsMargins(6, 4, 6, 4) - self.layout().setRowStretch(1, 1) + self.layout().setRowStretch(5, 1) def load_settings(self, defaults = False): if defaults: @@ -986,8 +1070,32 @@ def load_settings(self, defaults = False): # Kilgarriff's Ratio self.spin_box_kilgarriffs_ratio_smoothing_param.setValue(settings['kilgarriffs_ratio']['smoothing_param']) + # Mutual Information + self.combo_box_mi_base_log.set_base_log(settings['mi']['base_log']) + + # Pointwise Mutual Information + self.combo_box_pmi_base_log.set_base_log(settings['pmi']['base_log']) + + # Pointwise 
Mutual Information (Cubic) + self.combo_box_im3_base_log.set_base_log(settings['im3']['base_log']) + + # Pointwise Mutual Information (Squared) + self.combo_box_im2_base_log.set_base_log(settings['im2']['base_log']) + def apply_settings(self): # Kilgarriff's Ratio self.settings_custom['kilgarriffs_ratio']['smoothing_param'] = self.spin_box_kilgarriffs_ratio_smoothing_param.value() + # Mutual Information + self.settings_custom['mi']['base_log'] = self.combo_box_mi_base_log.get_base_log() + + # Pointwise Mutual Information + self.settings_custom['pmi']['base_log'] = self.combo_box_pmi_base_log.get_base_log() + + # Pointwise Mutual Information (Cubic) + self.settings_custom['im3']['base_log'] = self.combo_box_im3_base_log.get_base_log() + + # Pointwise Mutual Information (Squared) + self.settings_custom['im2']['base_log'] = self.combo_box_im2_base_log.get_base_log() + return True