Skip to content

Commit

Permalink
updated version 0.4.1
Browse files Browse the repository at this point in the history
  • Loading branch information
xtonev committed Nov 8, 2019
1 parent d78d856 commit 8a97b71
Show file tree
Hide file tree
Showing 12 changed files with 2,566 additions and 53 deletions.
76 changes: 53 additions & 23 deletions topicnet/cooking_machine/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(self, num_iters: int = 5)

from .cubes import PerplexityStrategy, GreedyStrategy
from .model_constructor import init_simple_default_model, create_default_topics
from .rel_toolbox_lite import count_vocab_size, handle_regularizer

import artm

Expand Down Expand Up @@ -201,7 +202,11 @@ def build_schema_for_regs():
for elem in artm.regularizers.__all__:
if "Regularizer" in elem:
class_of_object = getattr(artm.regularizers, elem)
res = wrap_in_map(build_schema_from_signature(class_of_object))
res = build_schema_from_signature(class_of_object)
if elem in ["SmoothSparseThetaRegularizer", "SmoothSparsePhiRegularizer",
"DecorrelatorPhiRegularizer"]:
res[Optional("relative", default=None)] = Bool()
res = wrap_in_map(res)

specific_schema = Map({class_of_object.__name__: res})
schemas[class_of_object.__name__] = specific_schema
Expand Down Expand Up @@ -392,11 +397,44 @@ def build_cube_settings(elemtype, elem_args):
"selection": elem_args['selection'].data}


def parse(yaml_string):
def _add_parsed_scores(parsed, topic_model):
    """
    Attach every score listed in the parsed config to ``topic_model``.

    Parameters
    ----------
    parsed : strictyaml.YAML
        validated config; its optional ``scores`` section is a list of
        single-key mappings ``{score_class_name: constructor_args}``
    topic_model : TopicModel
        model to receive the scores: native artm scores are added to the
        wrapped ``topic_model._model``, everything else is stored in
        ``topic_model.custom_scores``
    """
    for score in parsed.data.get('scores', []):
        for elemtype, elem_args in score.items():
            # names listed in artm.scores.__all__ are native library scores;
            # any other name is assumed to be a TopicNet custom score
            is_artm_score = elemtype in artm.scores.__all__
            score_object = build_score(elemtype, elem_args, is_artm_score)
            if is_artm_score:
                topic_model._model.scores.add(score_object, overwrite=True)
            else:
                topic_model.custom_scores[elemtype] = score_object


def _add_parsed_regularizers(
    parsed, model, specific_topic_names, background_topic_names, data_stats
):
    """
    Build every regularizer listed in the parsed config and register it
    on ``model``.

    Parameters
    ----------
    parsed : strictyaml.YAML
        validated config; its ``regularizers`` section is a list of
        single-key mappings ``{regularizer_class_name: constructor_args}``
    model : artm.ARTM
        model to be modified in place
    specific_topic_names : list of str
    background_topic_names : list of str
    data_stats : dict
        collection statistics used to rescale relative coefficients

    Returns
    -------
    list
        the regularizer instances actually registered on ``model``
    """
    regularizers = []
    for stage in parsed.data['regularizers']:
        for elemtype, elem_args in stage.items():
            # "relative" is a TopicNet extension, not an artm constructor
            # argument: strip it out before building the regularizer
            should_be_relative = elem_args.pop("relative", None)

            regularizer_object = build_regularizer(
                elemtype, elem_args, specific_topic_names, background_topic_names
            )
            handle_regularizer(should_be_relative, model, regularizer_object, data_stats)
            # handle_regularizer registers a (possibly rescaled) copy on the
            # model; return the registered instance, not the local object
            regularizers.append(model.regularizers[regularizer_object.name])
    return regularizers


def parse(yaml_string, force_single_thread=False):
"""
Parameters
----------
yaml_string : str
force_single_thread : bool
Returns
-------
Expand All @@ -418,39 +456,30 @@ def parse(yaml_string):
revalidate_section(parsed, "scores")

cube_settings = []
regularizers = []

dataset = Dataset(parsed.data["model"]["dataset_path"])
modalities_to_use = parsed.data["model"]["modalities_to_use"]

data_stats = count_vocab_size(dataset.get_dictionary(), modalities_to_use)
model = init_simple_default_model(
dataset=dataset,
modalities_to_use=parsed.data["model"]["modalities_to_use"],
modalities_to_use=modalities_to_use,
main_modality=parsed.data["model"]["main_modality"],
specific_topics=parsed.data["topics"]["specific_topics"],
background_topics=parsed.data["topics"]["background_topics"],
)
for stage in parsed.data['regularizers']:
for elemtype, elem_args in stage.items():

regularizer_object = build_regularizer(
elemtype, elem_args, specific_topic_names, background_topic_names
)
regularizers.append(regularizer_object)
model.regularizers.add(regularizer_object, overwrite=True)

regularizers = _add_parsed_regularizers(
parsed, model, specific_topic_names, background_topic_names, data_stats
)
topic_model = TopicModel(model)

for score in parsed.data.get('scores', []):
for elemtype, elem_args in score.items():
is_artm_score = elemtype in artm.scores.__all__
score_object = build_score(elemtype, elem_args, is_artm_score)
if is_artm_score:
model.scores.add(score_object, overwrite=True)
else:
topic_model.custom_scores[elemtype] = score_object
_add_parsed_scores(parsed, topic_model)

for stage in parsed['stages']:
for elemtype, elem_args in stage.items():
settings = build_cube_settings(elemtype.data, elem_args)
if force_single_thread:
settings[elemtype]["separate_thread"] = False
cube_settings.append(settings)

return cube_settings, regularizers, topic_model, dataset
Expand Down Expand Up @@ -486,8 +515,9 @@ def revalidate_section(parsed, section):
stage.revalidate(local_schema)


def build_experiment_environment_from_yaml_config(yaml_string, experiment_id, save_path):
settings, regs, model, dataset = parse(yaml_string)
def build_experiment_environment_from_yaml_config(yaml_string, experiment_id,
save_path, force_single_thread=False):
settings, regs, model, dataset = parse(yaml_string, force_single_thread)
# TODO: handle dynamic addition of regularizers
experiment = Experiment(experiment_id=experiment_id, save_path=save_path, topic_model=model)
experiment.build(settings)
Expand Down
1 change: 0 additions & 1 deletion topicnet/cooking_machine/cubes/controller_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,6 @@ def apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None
handle_regularizer(
self._relative,
new_model,
modalities,
new_regularizer,
self.data_stats,
)
Expand Down
12 changes: 8 additions & 4 deletions topicnet/cooking_machine/cubes/regularizer_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,14 @@ def apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None
regularizer_type = str(type(regularizer))
if isinstance(regularizer, dict):
if regularizer['name'] in new_model.regularizers.data:
setattr(new_model.regularizers[regularizer['name']],
field_name,
params)
new_regularizer = deepcopy(new_model.regularizers[regularizer['name']])
new_regularizer._tau = params
handle_regularizer(
self._relative,
new_model,
new_regularizer,
self.data_stats,
)
else:
error_msg = (f"Regularizer {regularizer['name']} does not exist. "
f"Cannot be modified.")
Expand All @@ -157,7 +162,6 @@ def apply(self, topic_model, one_model_parameter, dictionary=None, model_id=None
handle_regularizer(
self._relative,
new_model,
modalities,
new_regularizer,
self.data_stats,
)
Expand Down
14 changes: 14 additions & 0 deletions topicnet/cooking_machine/models/topic_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,3 +574,17 @@ def regularizers(self):
def class_ids(self):
""" """
return self._model.class_ids

def describe_scores(self):
    """
    Summarize the most recent value of every score tracked by this model.

    Returns
    -------
    pd.DataFrame
        indexed by ``(model_id, score_name)`` with a single column
        ``last_value`` holding the latest recorded value of each score
        (``None`` when a score has no recorded values yet)
    """
    data = [
        # guard against a score that was attached but never evaluated
        [self.model_id, score_name, values[-1] if len(values) else None]
        for score_name, values in self.scores.items()
    ]
    result = pd.DataFrame(columns=["model_id", "score_name", "last_value"], data=data)
    return result.set_index(["model_id", "score_name"])

def describe_regularizers(self):
    """
    Summarize the tau and gamma coefficients of every regularizer
    attached to this model.

    Returns
    -------
    pd.DataFrame
        indexed by ``(model_id, regularizer_name)`` with columns
        ``tau`` and ``gamma``
    """
    # NOTE(review): iterates the private ``_data`` mapping of the artm
    # regularizers container -- confirm no public accessor exists
    data = [
        [self.model_id, reg_name, reg.tau, reg.gamma]
        for reg_name, reg in self.regularizers._data.items()
    ]
    result = pd.DataFrame(
        columns=["model_id", "regularizer_name", "tau", "gamma"], data=data
    )
    return result.set_index(["model_id", "regularizer_name"])
24 changes: 7 additions & 17 deletions topicnet/cooking_machine/recipes/ARTM_baseline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,44 +24,34 @@ regularizers:
topic_names: background_topics
class_ids: {modality_list}
tau: 0.1
relative: true
- SmoothSparseThetaRegularizer:
name: smooth_theta_bcg
topic_names: background_topics
tau: 0.1
relative: true
scores:
- BleiLaffertyScore:
num_top_tokens: 15
num_top_tokens: 30
model:
dataset_path: {dataset_path}
modalities_to_use: {modality_list}
main_modality: '{main_modality}'

stages:
- RegularizersModifierCube:
num_iter: 1
reg_search: grid
regularizer_parameters:
- name: smooth_phi_bcg
tau_grid: [0.1]
- name: smooth_theta_bcg
tau_grid: [0.1]
selection:
- COLLECT 1
verbose: false
relative_coefficients: True
- RegularizersModifierCube:
num_iter: 20
reg_search: add
regularizer_parameters:
name: decorrelation_phi
selection:
- PerplexityScore@all < 1.01 * MINIMUM(PerplexityScore@all) and BleiLaffertyScore -> max
- PerplexityScore@all < 1.05 * MINIMUM(PerplexityScore@all) and BleiLaffertyScore -> max
strategy: PerplexityStrategy
# parameters of this strategy are intended for revision
strategy_params:
start_point: 0
step: 1000
max_len: 10000
step: 0.01
max_len: 50
tracked_score_function: PerplexityScore@all
verbose: false
relative_coefficients: false
relative_coefficients: true
6 changes: 2 additions & 4 deletions topicnet/cooking_machine/rel_toolbox_lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def modality_weight_rel2abs(tokens_data, weights, default_modality):
return taus


def handle_regularizer(use_relative_coefficients, model, modalities, regularizer, data_stats):
def handle_regularizer(use_relative_coefficients, model, regularizer, data_stats):
"""
Handles the case of various regularizers that
contain 'Regularizer' in their name, namely all artm regularizers
Expand All @@ -171,8 +171,6 @@ def handle_regularizer(use_relative_coefficients, model, modalities, regularizer
indicates whether regularizer should be altered
model : TopicModel or artm.ARTM
to be changed in place
modalities : dict
modalities used in the model
regularizer : an instance of Regularizer from artm library
data_stats : dict
collection-specific data
Expand All @@ -195,7 +193,7 @@ def handle_regularizer(use_relative_coefficients, model, modalities, regularizer
regularizer = transform_regularizer(
data_stats,
regularizer,
modalities,
model.class_ids,
n_topics,
)

Expand Down
Loading

0 comments on commit 8a97b71

Please sign in to comment.