-
Notifications
You must be signed in to change notification settings - Fork 74
v0.2.48..v0.2.49 changeset ImplicitTagRawRulesDeriver.h
Garret Voltz edited this page Oct 2, 2019
·
1 revision
diff --git a/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRawRulesDeriver.h b/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRawRulesDeriver.h
new file mode 100644
index 0000000..2256e8e
--- /dev/null
+++ b/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRawRulesDeriver.h
@@ -0,0 +1,166 @@
+/*
+ * This file is part of Hootenanny.
+ *
+ * Hootenanny is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * --------------------------------------------------------------------
+ *
+ * The following copyright notices are generated automatically. If you
+ * have a new notice to add, please use the format:
+ * " * @copyright Copyright ..."
+ * This will properly maintain the copyright information. DigitalGlobe
+ * copyrights will be updated automatically.
+ *
+ * @copyright Copyright (C) 2017, 2018, 2019 DigitalGlobe (http://www.digitalglobe.com/)
+ */
+#ifndef IMPLICITTAGRAWRULESDERIVER_H
+#define IMPLICITTAGRAWRULESDERIVER_H
+
+// Hoot
+#include <hoot/core/util/Configurable.h>
+#include <hoot/core/criterion/ImplicitTagEligibleCriterion.h>
+#include <hoot/core/io/ElementInputStream.h>
+#include <hoot/core/io/PartialOsmMapReader.h>
+#include <hoot/core/algorithms/string/StringTokenizer.h>
+#include <hoot/core/language/ToEnglishTranslator.h>
+
+// Qt
+#include <QString>
+#include <QHash>
+#include <QTemporaryFile>
+
+namespace hoot
+{
+
+class Tags;
+
+/**
+ * Derives implicit tag raw rules for elements and writes the output to a flat file
+ *
+ * The logic in this class is separated from that in ImplicitTagRulesDatabaseDeriver due to the fact
+ * that the logic in this class takes a significant amount of time to run against large input
+ * datasets (~5.5 hours to run against both global GeoNames and OSM at the time of this writing).
+ * The raw rules generated by this class can then be tweaked repeatedly with a number of settings
+ * when deriving the rules database with ImplicitTagRulesDatabaseDeriver. Databases typically take
+ * less than a minute to generate. So from one long raw rules file processing run by this class,
+ * many test rules databases can be generated by ImplicitTagRulesDatabaseDeriver.
+ */
+class ImplicitTagRawRulesDeriver : public Configurable
+{
+
+public:
+
+ ImplicitTagRawRulesDeriver();
+
+ /**
+ * Derives implicit tag rules for POIs given input data and writes the rules to output
+ *
+ * @param inputs a list of hoot supported feature input formats to derive rules from
+ * @param translationScripts a list of OSM translation scripts corresponding to the datasets
+ * specified by the inputs parameter
+ * @param output the file to write the rules to
+ */
+ void deriveRawRules(const QStringList& inputs, const QStringList& translationScripts,
+ const QString& output);
+
+ virtual void setConfiguration(const Settings& conf);
+
+ void setSortParallelCount(int count) { _sortParallelCount = count; }
+ void setSkipFiltering(bool skip) { _skipFiltering = skip; }
+ void setKeepTempFiles(bool keep) { _keepTempFiles = keep; }
+ void setTempFileDir(const QString& dir) { _tempFileDir = dir; }
+ void setTranslateNamesToEnglish(bool translate) { _translateNamesToEnglish = translate; }
+ void setElementCriterion(const QString& criterionName);
+
+private:
+
+ //gives the test class access to private members
+ friend class ImplicitTagRawRulesDeriverTest;
+
+ long _statusUpdateInterval;
+ long _countFileLineCtr;
+ //number of threads to use when calling Unix sort command; too high of a value in some VM
+ //environments can cause memory issues due to the VM OS not releasing the memory used by the
+ //command after it finishes
+ int _sortParallelCount;
+ //completely skip filtering out ineligible elements (those which don't satisfy _elementCriterion);
+ //to be used only when the input data has been pre-filtered
+ bool _skipFiltering;
+ //will keep all temp files; very useful for debugging sort work done by the Unix commands
+ bool _keepTempFiles;
+ QString _tempFileDir;
+ //if true, all element names are first translated to English before a raw rule is derived
+ bool _translateNamesToEnglish;
+
+ //contains the word/tag occurrence counts at various stages; line format:
+ //<count>\t<word>\t<key=value>
+ std::shared_ptr<QTemporaryFile> _countFile;
+ std::shared_ptr<QTemporaryFile> _sortedCountFile;
+ std::shared_ptr<QTemporaryFile> _dedupedCountFile;
+ std::shared_ptr<QTemporaryFile> _tieResolvedCountFile;
+
+ //final output file
+ std::shared_ptr<QFile> _output;
+
+ //maps a name token concatenated with a tag key to a tag value
+ QHash<QString, QString> _wordKeysToCountsValues;
+ //same as above but used to combine duplicated count file lines into single lines
+ QHash<QString, QStringList> _duplicatedWordTagKeyCountsToValues;
+
+ StringTokenizer _tokenizer;
+
+ std::shared_ptr<PartialOsmMapReader> _inputReader;
+
+ //controls which elements have tags harvested from them
+ std::shared_ptr<ImplicitTagEligibleCriterion> _elementCriterion;
+
+ //translates names to English
+ std::shared_ptr<ToEnglishTranslator> _translator;
+
+ void _init();
+ void _validateInputs(const QStringList& inputs, const QStringList& translationScripts,
+ const QString& output);
+ std::shared_ptr<ElementInputStream> _getInputStream(const QString& input,
+ const QString& translationScript);
+
+ /*
+ * Examines each word token to determine if a raw implicit tag rule should be created for it
+ */
+ void _updateForNewWord(const QString& word, const QString& kvp);
+ /*
+ * Gets tags to generate raw implicit tag rules from
+ */
+ QStringList _getPoiKvps(const Tags& tags) const;
+
+ void _parseNames(const QStringList& names, const QStringList& kvps);
+ void _parseNameToken(QString& nameToken, const QStringList& kvps);
+
+ /*
+ * Sorts word/tag occurrence count lines by descending occurrence count
+ */
+ void _sortByTagOccurrence();
+ /*
+ * Ensures that no two tag keys have the same word and occurrence count line
+ */
+ void _removeDuplicatedKeyTypes();
+ /*
+ * In cases where there is a word/tag key occurrence count tie, this resolves the tie.
+ */
+ void _resolveCountTies();
+ void _sortByWord(const std::shared_ptr<QTemporaryFile>& input);
+};
+
+}
+
+#endif // IMPLICITTAGRAWRULESDERIVER_H