Skip to content

Commit

Permalink
Settings: Add Settings - Measures - Effect Size - Mutual Information …
Browse files Browse the repository at this point in the history
…/ Pointwise Mutual Information / Pointwise Mutual Information (Cubic) / Pointwise Mutual Information (Squared)
  • Loading branch information
BLKSerene committed Nov 4, 2024
1 parent db1cc92 commit b61e9b2
Show file tree
Hide file tree
Showing 10 changed files with 352 additions and 162 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
### 🎉 New Features
- Measures: Add effect size - conditional probability / ΔP / squared association ratio
- Settings: Add Settings - Measures - Effect Size - Mutual Information / Pointwise Mutual Information / Pointwise Mutual Information (Cubic) / Pointwise Mutual Information (Squared)
- Utils: Add Stanza's Sindhi dependency parser

### 📌 Bugfixes
Expand Down
54 changes: 27 additions & 27 deletions doc/doc.md

Large diffs are not rendered by default.

63 changes: 35 additions & 28 deletions doc/measures/effect_size/im2.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
66 changes: 36 additions & 30 deletions doc/measures/effect_size/im3.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
86 changes: 46 additions & 40 deletions doc/measures/effect_size/mi.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
64 changes: 35 additions & 29 deletions doc/measures/effect_size/pmi.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 6 additions & 0 deletions wordless/wl_measures/wl_measure_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,9 @@ def numpy_log2(a, default = 0):
return numpy.log2(a, out = numpy.full_like(a, default, dtype = float), where = a > 0)
else:
return numpy.log2(a, out = numpy.zeros_like(a, dtype = float), where = a > 0)

def numpy_log10(a, default = 0):
    """Compute the element-wise base-10 logarithm, skipping non-positive values.

    Args:
        a: Array-like of numbers. Plain Python sequences and scalars are
            accepted and converted with ``numpy.asanyarray``.
        default: Value placed in the result wherever ``a <= 0`` (or the
            comparison ``a > 0`` is otherwise false, e.g. for NaN). A falsy
            value (``0``, ``None``, ...) means zero.

    Returns:
        A float array shaped like ``a`` holding ``log10(a)`` where ``a > 0``
        and ``default`` elsewhere.
    """
    # Coerce first so that the mask `a > 0` also works for lists and tuples
    a = numpy.asanyarray(a)
    # The output buffer doubles as the source of the fallback values: positions
    # excluded by `where` are left untouched by the ufunc
    out = numpy.full_like(a, default if default else 0, dtype = float)

    return numpy.log10(a, out = out, where = a > 0)
48 changes: 41 additions & 7 deletions wordless/wl_measures/wl_measures_effect_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

# pylint: disable=unused-argument

import math

import numpy

from wordless.wl_measures import wl_measures_statistical_significance, wl_measure_utils
Expand Down Expand Up @@ -133,10 +135,18 @@ def mi(main, o11s, o12s, o21s, o22s):
oxxs = o11s + o12s + o21s + o22s
e11s, e12s, e21s, e22s = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s)

mi_11 = wl_measure_utils.numpy_divide(o11s, oxxs) * wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s, e11s))
mi_12 = wl_measure_utils.numpy_divide(o12s, oxxs) * wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o12s, e12s))
mi_21 = wl_measure_utils.numpy_divide(o21s, oxxs) * wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o21s, e21s))
mi_22 = wl_measure_utils.numpy_divide(o22s, oxxs) * wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o22s, e22s))
match main.settings_custom['measures']['effect_size']['mi']['base_log']:
case 2:
numpy_log = wl_measure_utils.numpy_log2
case 10:
numpy_log = wl_measure_utils.numpy_log10
case math.e:
numpy_log = wl_measure_utils.numpy_log

mi_11 = wl_measure_utils.numpy_divide(o11s, oxxs) * numpy_log(wl_measure_utils.numpy_divide(o11s, e11s))
mi_12 = wl_measure_utils.numpy_divide(o12s, oxxs) * numpy_log(wl_measure_utils.numpy_divide(o12s, e12s))
mi_21 = wl_measure_utils.numpy_divide(o21s, oxxs) * numpy_log(wl_measure_utils.numpy_divide(o21s, e21s))
mi_22 = wl_measure_utils.numpy_divide(o22s, oxxs) * numpy_log(wl_measure_utils.numpy_divide(o22s, e22s))

return mi_11 + mi_12 + mi_21 + mi_22

Expand Down Expand Up @@ -179,21 +189,45 @@ def pct_diff(main, o11s, o12s, o21s, o22s):
def pmi(main, o11s, o12s, o21s, o22s):
e11s, _, _, _ = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s)

return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s, e11s))
match main.settings_custom['measures']['effect_size']['pmi']['base_log']:
case 2:
numpy_log = wl_measure_utils.numpy_log2
case 10:
numpy_log = wl_measure_utils.numpy_log10
case math.e:
numpy_log = wl_measure_utils.numpy_log

return numpy_log(wl_measure_utils.numpy_divide(o11s, e11s))

# Pointwise mutual information (cubic)
# Reference: Daille, B. (1994). Approche mixte pour l'extraction automatique de terminologie: statistiques lexicales et filtres linguistiques [Doctoral thesis, Paris Diderot University]. Béatrice Daille. http://www.bdaille.com/index.php?option=com_docman&task=doc_download&gid=8&Itemid= | p. 139
def im3(main, o11s, o12s, o21s, o22s):
e11s, _, _, _ = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s)

return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s ** 3, e11s))
match main.settings_custom['measures']['effect_size']['im3']['base_log']:
case 2:
numpy_log = wl_measure_utils.numpy_log2
case 10:
numpy_log = wl_measure_utils.numpy_log10
case math.e:
numpy_log = wl_measure_utils.numpy_log

return numpy_log(wl_measure_utils.numpy_divide(o11s ** 3, e11s))

# Pointwise mutual information (squared)
# Reference: Daille, B. (1995). Combined approach for terminology extraction: Lexical statistics and linguistic filtering. UCREL technical papers (Vol. 5). Lancaster University. | p. 21
def im2(main, o11s, o12s, o21s, o22s):
e11s, _, _, _ = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s)

return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s ** 2, e11s))
match main.settings_custom['measures']['effect_size']['im2']['base_log']:
case 2:
numpy_log = wl_measure_utils.numpy_log2
case 10:
numpy_log = wl_measure_utils.numpy_log10
case math.e:
numpy_log = wl_measure_utils.numpy_log

return numpy_log(wl_measure_utils.numpy_divide(o11s ** 2, e11s))

# Poisson collocation measure
# Reference: Quasthoff, U., & Wolff, C. (2002). The poisson collocation measure and its applications. Proceedings of 2nd International Workshop on Computational Approaches to Collocations. IEEE.
Expand Down
16 changes: 16 additions & 0 deletions wordless/wl_settings/wl_settings_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -2433,6 +2433,22 @@ def init_settings_default(main):
'effect_size': {
'kilgarriffs_ratio': {
'smoothing_param': 1.00
},

'mi': {
'base_log': 2
},

'pmi': {
'base_log': 2
},

'im3': {
'base_log': 2
},

'im2': {
'base_log': 2
}
}
},
Expand Down
110 changes: 109 additions & 1 deletion wordless/wl_settings/wl_settings_measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,16 @@
# ----------------------------------------------------------------------

import copy
import math

from PyQt5.QtCore import QCoreApplication
from PyQt5.QtWidgets import QCheckBox, QGroupBox, QLabel

from wordless.wl_settings import wl_settings
from wordless.wl_widgets import wl_boxes, wl_layouts, wl_widgets

_tr = QCoreApplication.translate

# Measures - Readability
class Wl_Settings_Measures_Readability(wl_settings.Wl_Settings_Node):
def __init__(self, main):
Expand Down Expand Up @@ -950,6 +954,34 @@ def apply_settings(self):
return True

# Measures - Effect Size
class Wl_Combo_Box_Base_Log(wl_boxes.Wl_Combo_Box):
    """Combo box for choosing the base of a logarithm: 2, 10, or e.

    The selected base is resolved through the item index rather than the
    displayed text, so the mapping does not depend on how the label of the
    natural-logarithm entry is translated.
    """

    # Bases in the same order as the items added in __init__
    BASES_LOG = (2, 10, math.e)

    def __init__(self, parent):
        super().__init__(parent)

        self.addItems([
            '2',
            '10',
            _tr('wl_settings_measures', 'Base of natural logarithm')
        ])

    def get_base_log(self):
        """Return the base matching the current item, or None if no listed item is selected."""
        index = self.currentIndex()

        # currentIndex() is -1 when nothing is selected; guard so that a negative
        # index is not silently interpreted as "last item"
        if 0 <= index < len(self.BASES_LOG):
            return self.BASES_LOG[index]

        return None

    def set_base_log(self, base_log):
        """Select the item matching base_log; unsupported values leave the selection unchanged."""
        try:
            index = self.BASES_LOG.index(base_log)
        except ValueError:
            return

        self.setCurrentIndex(index)

class Wl_Settings_Measures_Effect_Size(wl_settings.Wl_Settings_Node):
def __init__(self, main):
super().__init__(main)
Expand All @@ -971,11 +1003,63 @@ def __init__(self, main):

self.group_box_kilgarriffs_ratio.layout().setColumnStretch(2, 1)

# Mutual Information
self.group_box_mi = QGroupBox(self.tr('Mutual Information'), self)

self.label_mi_base_log = QLabel(self.tr('Base of logarithm:'), self)
self.combo_box_mi_base_log = Wl_Combo_Box_Base_Log(self)

self.group_box_mi.setLayout(wl_layouts.Wl_Layout())
self.group_box_mi.layout().addWidget(self.label_mi_base_log, 0, 0)
self.group_box_mi.layout().addWidget(self.combo_box_mi_base_log, 0, 1)

self.group_box_mi.layout().setColumnStretch(2, 1)

# Pointwise Mutual Information
self.group_box_pmi = QGroupBox(self.tr('Pointwise Mutual Information'), self)

self.label_pmi_base_log = QLabel(self.tr('Base of logarithm:'), self)
self.combo_box_pmi_base_log = Wl_Combo_Box_Base_Log(self)

self.group_box_pmi.setLayout(wl_layouts.Wl_Layout())
self.group_box_pmi.layout().addWidget(self.label_pmi_base_log, 0, 0)
self.group_box_pmi.layout().addWidget(self.combo_box_pmi_base_log, 0, 1)

self.group_box_pmi.layout().setColumnStretch(2, 1)

# Pointwise Mutual Information (Cubic)
self.group_box_im3 = QGroupBox(self.tr('Pointwise Mutual Information (Cubic)'), self)

self.label_im3_base_log = QLabel(self.tr('Base of logarithm:'), self)
self.combo_box_im3_base_log = Wl_Combo_Box_Base_Log(self)

self.group_box_im3.setLayout(wl_layouts.Wl_Layout())
self.group_box_im3.layout().addWidget(self.label_im3_base_log, 0, 0)
self.group_box_im3.layout().addWidget(self.combo_box_im3_base_log, 0, 1)

self.group_box_im3.layout().setColumnStretch(2, 1)

# Pointwise Mutual Information (Squared)
self.group_box_im2 = QGroupBox(self.tr('Pointwise Mutual Information (Squared)'), self)

self.label_im2_base_log = QLabel(self.tr('Base of logarithm:'), self)
self.combo_box_im2_base_log = Wl_Combo_Box_Base_Log(self)

self.group_box_im2.setLayout(wl_layouts.Wl_Layout())
self.group_box_im2.layout().addWidget(self.label_im2_base_log, 0, 0)
self.group_box_im2.layout().addWidget(self.combo_box_im2_base_log, 0, 1)

self.group_box_im2.layout().setColumnStretch(2, 1)

self.setLayout(wl_layouts.Wl_Layout())
self.layout().addWidget(self.group_box_kilgarriffs_ratio, 0, 0)
self.layout().addWidget(self.group_box_mi, 1, 0)
self.layout().addWidget(self.group_box_pmi, 2, 0)
self.layout().addWidget(self.group_box_im3, 3, 0)
self.layout().addWidget(self.group_box_im2, 4, 0)

self.layout().setContentsMargins(6, 4, 6, 4)
self.layout().setRowStretch(1, 1)
self.layout().setRowStretch(5, 1)

def load_settings(self, defaults = False):
if defaults:
Expand All @@ -986,8 +1070,32 @@ def load_settings(self, defaults = False):
# Kilgarriff's Ratio
self.spin_box_kilgarriffs_ratio_smoothing_param.setValue(settings['kilgarriffs_ratio']['smoothing_param'])

# Mutual Information
self.combo_box_mi_base_log.set_base_log(settings['mi']['base_log'])

# Pointwise Mutual Information
self.combo_box_pmi_base_log.set_base_log(settings['pmi']['base_log'])

# Pointwise Mutual Information (Cubic)
self.combo_box_im3_base_log.set_base_log(settings['im3']['base_log'])

# Pointwise Mutual Information (Squared)
self.combo_box_im2_base_log.set_base_log(settings['im2']['base_log'])

def apply_settings(self):
    """Write the values currently shown by this page's widgets into self.settings_custom.

    Returns:
        Always True.
    """
    # Kilgarriff's Ratio
    self.settings_custom['kilgarriffs_ratio']['smoothing_param'] = self.spin_box_kilgarriffs_ratio_smoothing_param.value()

    # Mutual Information
    # Pointwise Mutual Information
    # Pointwise Mutual Information (Cubic)
    # Pointwise Mutual Information (Squared)
    combo_boxes_base_log = (
        ('mi', self.combo_box_mi_base_log),
        ('pmi', self.combo_box_pmi_base_log),
        ('im3', self.combo_box_im3_base_log),
        ('im2', self.combo_box_im2_base_log)
    )

    for measure, combo_box in combo_boxes_base_log:
        self.settings_custom[measure]['base_log'] = combo_box.get_base_log()

    return True

0 comments on commit b61e9b2

Please sign in to comment.