-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLequel.h
71 lines (59 loc) · 1.71 KB
/
Lequel.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
/**
* @file Lequel.h
* @author Marc S. Ressl, Albertina Galan, Alejandro Nahuel Heir
* @brief Language identification based on trigrams.
* @version 0.1
* @date 2022-03-29
*
* @copyright Copyright (c) 2022
*
*/
// Include guard. The macro name intentionally does not start with an
// underscore followed by an uppercase letter: such identifiers are reserved
// for the C++ implementation and must not be defined by user code.
#ifndef LEQUEL_H
#define LEQUEL_H
#include <list>
#include <map>
#include <string>
#include "Text.h"
/// Maps a trigram (stored as a std::string) to its frequency.
using TrigramProfile = std::map<std::string, float>;

/// A list of trigrams, each stored as a std::string.
using TrigramList = std::list<std::string>;

/// Associates a language code with the trigram profile of that language.
struct Language
{
    std::string languageCode;      ///< Code naming the language (format not specified here).
    TrigramProfile trigramProfile; ///< Trigram -> frequency map for this language.
};

/// A list of Language entries.
using Languages = std::list<Language>;
// Functions
/**
 * @brief Builds a trigram profile for a given text.
 *
 * @param text The input text, taken by const reference. Described as a list
 *             of lines; the Text type is presumably declared in "Text.h".
 * @return TrigramProfile A map of trigram -> frequency, returned by value.
 *
 * NOTE(review): only the declaration is visible in this header. Whether the
 * returned frequencies are raw counts or already normalized should be
 * confirmed against the implementation.
 */
TrigramProfile buildTrigramProfile(const Text &text);
/**
 * @brief Normalizes a trigram profile.
 *
 * @param trigramProfile The profile to normalize. It is taken by non-const
 *                       reference and the function returns void, so the
 *                       result is presumably written back into this map.
 *
 * NOTE(review): the kind of normalization applied (e.g. scaling to unit
 * length) is not stated in this header; confirm against the implementation.
 */
void normalizeTrigramProfile(TrigramProfile &trigramProfile);
/**
 * @brief Calculates the cosine similarity between a text trigram profile
 *        and a language trigram profile.
 *
 * @param textProfile Text trigram profile.
 * @param languageProfile Language trigram profile.
 * @return float The cosine similarity score.
 *
 * NOTE(review): both profiles are taken by non-const reference. Whether the
 * implementation actually modifies them (for example through
 * std::map::operator[], which inserts missing keys) cannot be determined
 * from this declaration; confirm before relying on the arguments being
 * left unchanged.
 */
float getCosineSimilarity(TrigramProfile &textProfile, TrigramProfile &languageProfile);
/**
 * @brief Identifies the language of a text.
 *
 * @param text The text to identify, taken by const reference (described as
 *             a list of lines).
 * @param languages The trigram profiles of all candidate languages, taken
 *                  by non-const reference.
 * @param languagesMatched Output: an array of strings that will contain the
 *                         top 3 languages matched.
 *
 * NOTE(review): languagesMatched is a raw pointer with no accompanying
 * length, so the caller presumably has to provide storage for at least
 * 3 std::string elements; confirm against the implementation. What is
 * written when fewer than 3 languages are available is likewise not visible
 * from this header.
 */
void identifyLanguage(const Text &text, Languages &languages, std::string *languagesMatched);
#endif