Skip to content

Commit

Permalink
LMAssembly // Pack LMUserOverride inside LMInstantiator, etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
ShikiSuen committed Feb 23, 2024
1 parent c589915 commit e44843e
Show file tree
Hide file tree
Showing 35 changed files with 303 additions and 316 deletions.
10 changes: 5 additions & 5 deletions Packages/vChewing_LangModelAssembly/README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
# LangModelAssembly

威注音輸入法的語言模組總成套裝
威注音輸入法的語言模組總成套裝,以 LMAssembly 命名空間承載下述唯二對外物件:

- vChewingLM:總命名空間,也承載一些在套裝內共用的工具函式。
- LMConsolidator:自動格式整理模組。
- LMInstantiator:語言模組副本化模組。另有其日期時間擴充模組可用(對 CIN 磁帶模式無效)。
- LMInstantiator:語言模組副本化模組,亦集成一些自身功能擴展。

LMAssembly 總命名空間也承載一些在套裝內共用的工具函式。

以下是子模組:

- lmCassette:專門用來處理 CIN 磁帶檔案的模組,命名為「遠野」引擎。
- LMAssociates:關聯詞語模組。
- lmCassette:專門用來處理 CIN 磁帶檔案的模組,命名為「遠野」引擎。
- LMCoreEX:可以直接讀取 TXT 格式的帶有權重資料的語彙檔案的模組。
- LMCoreJSON:專門用來讀取原廠 JSON 檔案的模組。
- lmPlainBopomofo:專門用來讀取使用者自訂ㄅ半候選字順序覆蓋定義檔案(JSON)的模組。
- lmReplacements:專門用來讀取使用者語彙置換模式的辭典資料的模組。
- lmUserOverride:半衰記憶模組。
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,31 @@ import Foundation
/// 工作原理:先用 InputToken.parse 分析原始字串,給出準確的 Token。
/// 然後再讓這個 Token 用 .translated() 自我表述出轉換結果。

public enum InputToken {
case timeZone(shortened: Bool)
case timeNow(shortened: Bool)
case date(dayDelta: Int = 0, yearDelta: Int = 0, shortened: Bool = true, luna: Bool = false)
case week(dayDelta: Int = 0, shortened: Bool = true)
case year(yearDelta: Int = 0)
case yearGanzhi(yearDelta: Int = 0)
case yearZodiac(yearDelta: Int = 0)
extension LMAssembly {
/// Token model for input macros, nested inside the `LMAssembly` namespace.
///
/// Tokens are built by `InputToken.parse(from:)` and rendered into strings by
/// `translated(isCHS:)`.
///
/// NOTE(review): the per-case remarks below are inferred from the case and
/// label names only — confirm against `translated(isCHS:)`.
enum InputToken {
// Presumably the current time zone; `shortened` has no default value here.
case timeZone(shortened: Bool)
// Presumably the current time of day; `shortened` has no default value here.
case timeNow(shortened: Bool)
// Presumably a calendar date offset by `dayDelta` / `yearDelta` (both default to 0).
// `shortened` defaults to `true`; `luna` defaults to `false`.
case date(dayDelta: Int = 0, yearDelta: Int = 0, shortened: Bool = true, luna: Bool = false)
// Presumably the weekday offset by `dayDelta` (defaults to 0); `shortened` defaults to `true`.
case week(dayDelta: Int = 0, shortened: Bool = true)
// Presumably the year offset by `yearDelta` (defaults to 0).
case year(yearDelta: Int = 0)
// Presumably the sexagenary (Ganzhi) name of the year offset by `yearDelta` (defaults to 0).
case yearGanzhi(yearDelta: Int = 0)
// Presumably the zodiac name of the year offset by `yearDelta` (defaults to 0).
case yearZodiac(yearDelta: Int = 0)
}
}

// MARK: - 正式對外投入使用的 API。

public extension String {
/// Interprets this string as a raw input token and returns its rendered results.
///
/// The string is handed to `InputToken.parse(from:)`; each resulting token is
/// rendered with `translated(isCHS:)`, the per-token arrays are flattened into
/// one array, and that array's `deduplicated` form is returned.
///
/// - Parameter isCHS: Forwarded unchanged to `translated(isCHS:)`.
/// - Returns: The flattened, deduplicated strings; this is empty whenever
///   `parse(from:)` yields no tokens.
func parseAsInputToken(isCHS: Bool) -> [String] {
InputToken.parse(from: self).map { $0.translated(isCHS: isCHS) }.flatMap { $0 }.deduplicated
LMAssembly.InputToken.parse(from: self).map { $0.translated(isCHS: isCHS) }.flatMap { $0 }.deduplicated
}
}

// MARK: - Parser parsing raw token value to construct token.

public extension InputToken {
static func parse(from rawToken: String) -> [InputToken] {
var result: [InputToken] = []
extension LMAssembly.InputToken {
static func parse(from rawToken: String) -> [LMAssembly.InputToken] {
var result: [LMAssembly.InputToken] = []
guard rawToken.prefix(6) == "MACRO@" else { return result }
var mapParams: [String: Int] = [:]
let tokenComponents = rawToken.dropFirst(6).split(separator: "_").map { param in
Expand Down Expand Up @@ -69,7 +71,7 @@ public extension InputToken {

// MARK: - Parser parsing token itself.

public extension InputToken {
extension LMAssembly.InputToken {
func translated(isCHS: Bool) -> [String] {
let locale = Locale(identifier: isCHS ? "zh-Hans" : "zh-Hant-TW")
let formatter = DateFormatter()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import Foundation
import LineReader
import Shared

public extension vChewingLM {
public extension LMAssembly {
enum LMConsolidator {
public static let kPragmaHeader = "# 𝙵𝙾𝚁𝙼𝙰𝚃 𝚘𝚛𝚐.𝚊𝚝𝚎𝚕𝚒𝚎𝚛𝙸𝚗𝚖𝚞.𝚟𝚌𝚑𝚎𝚠𝚒𝚗𝚐.𝚞𝚜𝚎𝚛𝙻𝚊𝚗𝚐𝚞𝚊𝚐𝚎𝙼𝚘𝚍𝚎𝚕𝙳𝚊𝚝𝚊.𝚏𝚘𝚛𝚖𝚊𝚝𝚝𝚎𝚍"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import Megrez
import Shared
import SQLite3

public extension vChewingLM {
public extension LMAssembly {
/// 語言模組副本化模組(LMInstantiator,下稱「LMI」)自身為符合天權星組字引擎內
/// 的 LangModelProtocol 協定的模組、統籌且整理來自其它子模組的資料(包括使
/// 用者語彙、繪文字模組、語彙濾除表、原廠語言模組等)。
Expand Down Expand Up @@ -56,8 +56,12 @@ public extension vChewingLM {
public var config = Config()

// 這句需要留著,不然無法被 package 外界存取。
public init(isCHS: Bool = false) {
public init(
isCHS: Bool = false,
uomDataURL: URL? = nil
) {
self.isCHS = isCHS
lmUserOverride = .init(dataURL: uomDataURL)
}

public func setOptions(handler: (inout Config) -> Void) {
Expand Down Expand Up @@ -109,6 +113,9 @@ public extension vChewingLM {
var lmAssociates = LMAssociates()
var lmPlainBopomofo = LMPlainBopomofo()

// 半衰記憶模組
var lmUserOverride: LMUserOverride

// MARK: - 工具函式

public func resetFactoryJSONModels() {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import Foundation
import Megrez
import Shared

public extension vChewingLM.LMInstantiator {
public extension LMAssembly.LMInstantiator {
/// Cassette mode only: the wildcard key specified by the current cassette.
///
/// Reads `wildcardKey` from the type-level `lmCassette` instance.
var cassetteWildcardKey: String { Self.lmCassette.wildcardKey }
/// 磁帶模式專用:當前磁帶規定的最大碼長。
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import Megrez

// MARK: - 日期時間便捷輸入功能

extension vChewingLM.LMInstantiator {
extension LMAssembly.LMInstantiator {
func queryDateTimeUnigrams(with key: String = "") -> [Megrez.Unigram] {
guard let tokenTrigger = TokenTrigger(rawValue: key) else { return [] }
var results = [Megrez.Unigram]()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import Foundation
import Megrez

public extension vChewingLM.LMInstantiator {
public extension LMAssembly.LMInstantiator {
func supplyNumPadUnigrams(key: String) -> [Megrez.Unigram] {
guard let status = config.numPadFWHWStatus else { return [] }
let initials = "_NumPad_"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,30 +31,32 @@ import SQLite3
) WITHOUT ROWID;
*/

enum CoreColumn: Int32 {
case theDataCHS = 1 // 簡體中文
case theDataCHT = 2 // 繁體中文
case theDataCNS = 3 // 全字庫
case theDataMISC = 4 // 待辦
case theDataSYMB = 5 // 符號圖
case theDataCHEW = 6 // 注音文
extension LMAssembly.LMInstantiator {
enum CoreColumn: Int32 {
case theDataCHS = 1 // 簡體中文
case theDataCHT = 2 // 繁體中文
case theDataCNS = 3 // 全字庫
case theDataMISC = 4 // 待辦
case theDataSYMB = 5 // 符號圖
case theDataCHEW = 6 // 注音文

var name: String { String(describing: self) }
var name: String { String(describing: self) }

var id: Int32 { rawValue }
var id: Int32 { rawValue }

var defaultScore: Double {
switch self {
case .theDataCHEW: return -1
case .theDataCNS: return -11
case .theDataSYMB: return -13
case .theDataMISC: return -10
default: return -9.9
var defaultScore: Double {
switch self {
case .theDataCHEW: return -1
case .theDataCNS: return -11
case .theDataSYMB: return -13
case .theDataMISC: return -10
default: return -9.9
}
}
}
}

extension vChewingLM.LMInstantiator {
extension LMAssembly.LMInstantiator {
fileprivate static func querySQL(strStmt sqlQuery: String, coreColumn column: CoreColumn, handler: (String) -> Void) {
guard Self.ptrSQL != nil else { return }
performStatementSansResult { ptrStatement in
Expand Down Expand Up @@ -134,7 +136,9 @@ extension vChewingLM.LMInstantiator {
/// - parameters:
/// - key: 讀音索引鍵。
/// - column: 資料欄位。
func factoryUnigramsFor(key: String, column: CoreColumn) -> [Megrez.Unigram] {
func factoryUnigramsFor(
key: String, column: LMAssembly.LMInstantiator.CoreColumn
) -> [Megrez.Unigram] {
if key == "_punctuation_list" { return [] }
var grams: [Megrez.Unigram] = []
var gramsHW: [Megrez.Unigram] = []
Expand Down Expand Up @@ -210,7 +214,7 @@ extension vChewingLM.LMInstantiator {
}
}

private extension vChewingLM.LMInstantiator {
private extension LMAssembly.LMInstantiator {
/// 內部函式,用以將注音讀音索引鍵進行加密。
///
/// 使用這種加密字串作為索引鍵,可以增加對 json 資料庫的存取速度。
Expand Down Expand Up @@ -258,7 +262,7 @@ private extension vChewingLM.LMInstantiator {
]
}

public extension vChewingLM.LMInstantiator {
public extension LMAssembly.LMInstantiator {
/// Connects to a test database and populates it from `sqlTestCoreLMData`.
///
/// Calls `connectSQLDB(dbPath:)` with the path `:memory:` and, only if that
/// succeeds (the `&&` short-circuits), runs `sqlTestCoreLMData` through
/// `runAsSQLExec(dbPointer:)` against the shared `ptrSQL` pointer.
///
/// NOTE(review): `:memory:` is SQLite's in-memory database name; presumably
/// `connectSQLDB` passes it straight to SQLite — confirm.
///
/// - Returns: `true` only when both the connection and the SQL execution
///   report success. The result may be ignored (`@discardableResult`).
@discardableResult static func connectToTestSQLDB() -> Bool {
Self.connectSQLDB(dbPath: #":memory:"#) && sqlTestCoreLMData.runAsSQLExec(dbPointer: &ptrSQL)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// (c) 2021 and onwards The vChewing Project (MIT-NTL License).
// ====================
// This code is released under the MIT license (SPDX-License-Identifier: MIT)
// ... with NTL restriction stating that:
// No trademark license is granted to use the trade names, trademarks, service
// marks, or product names of Contributor, except as required to fulfill notice
// requirements defined in MIT License.

import Foundation
import Megrez

/// Public pass-through API for the user-override module held in `lmUserOverride`.
///
/// Every method in this extension forwards its arguments unchanged to the
/// correspondingly named method on `lmUserOverride` and adds no logic of its own.
public extension LMAssembly.LMInstantiator {
/// Forwards an observation to `lmUserOverride.performObservation`.
///
/// - Parameters:
///   - walkedBefore: Passed through as `walkedBefore`; presumably the walk
///     result before the user's change — confirm against `LMUserOverride`.
///   - walkedAfter: Passed through as `walkedAfter`; presumably the walk
///     result after the user's change — confirm against `LMUserOverride`.
///   - cursor: Passed through as `cursor`.
///   - timestamp: Passed through as `timestamp`.
///   - saveCallback: Optional closure passed through as `saveCallback`;
///     defaults to `nil`. When it is invoked is decided by `LMUserOverride`.
func performUOMObservation(
walkedBefore: [Megrez.Node],
walkedAfter: [Megrez.Node],
cursor: Int,
timestamp: Double,
saveCallback: (() -> Void)? = nil
) {
lmUserOverride.performObservation(
walkedBefore: walkedBefore,
walkedAfter: walkedAfter,
cursor: cursor,
timestamp: timestamp,
saveCallback: saveCallback
)
}

/// Returns whatever `lmUserOverride.fetchSuggestion` yields for the given walk.
///
/// - Parameters:
///   - currentWalk: Passed through as `currentWalk`.
///   - cursor: Passed through as `cursor`.
///   - timestamp: Passed through as `timestamp`.
/// - Returns: The `LMAssembly.OverrideSuggestion` produced by `lmUserOverride`.
func fetchUOMSuggestion(
currentWalk: [Megrez.Node],
cursor: Int,
timestamp: Double
) -> LMAssembly.OverrideSuggestion {
lmUserOverride.fetchSuggestion(
currentWalk: currentWalk,
cursor: cursor,
timestamp: timestamp
)
}

/// Forwards to `lmUserOverride.loadData(fromURL:)`.
///
/// - Parameter fileURL: Optional URL passed through unchanged; defaults to `nil`.
///   NOTE(review): how `nil` is handled is decided by `LMUserOverride`
///   (presumably a fallback to the URL it was initialized with) — confirm.
func loadUOMData(fromURL fileURL: URL? = nil) {
lmUserOverride.loadData(fromURL: fileURL)
}

/// Forwards to `lmUserOverride.saveData(toURL:)`.
///
/// - Parameter fileURL: Optional URL passed through unchanged; defaults to `nil`.
func saveUOMData(toURL fileURL: URL? = nil) {
lmUserOverride.saveData(toURL: fileURL)
}

/// Forwards to `lmUserOverride.clearData(withURL:)`.
///
/// - Parameter fileURL: Optional URL passed through unchanged; defaults to `nil`.
func clearUOMData(withURL fileURL: URL? = nil) {
lmUserOverride.clearData(withURL: fileURL)
}

/// Forwards to `lmUserOverride.bleachSpecifiedSuggestions(targets:saveCallback:)`.
///
/// - Parameters:
///   - targets: Strings passed through as `targets`; presumably the
///     suggestions to remove — confirm against `LMUserOverride`.
///   - saveCallback: Optional closure passed through as `saveCallback`;
///     defaults to `nil`.
func bleachSpecifiedUOMSuggestions(targets: [String], saveCallback: (() -> Void)? = nil) {
lmUserOverride.bleachSpecifiedSuggestions(targets: targets, saveCallback: saveCallback)
}

/// Forwards to `lmUserOverride.bleachUnigrams(saveCallback:)`.
///
/// - Parameter saveCallback: Optional closure passed through as `saveCallback`;
///   defaults to `nil`.
func bleachUOMUnigrams(saveCallback: (() -> Void)? = nil) {
lmUserOverride.bleachUnigrams(saveCallback: saveCallback)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ import Megrez
import PinyinPhonaConverter
import Shared

public extension vChewingLM {
@frozen struct LMAssociates {
extension LMAssembly {
struct LMAssociates {
public private(set) var filePath: String?
var rangeMap: [String: [(Range<String.Index>, Int)]] = [:]
var strData: String = ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ import LineReader
import Megrez
import Shared

public extension vChewingLM {
extension LMAssembly {
/// 磁帶模組,用來方便使用者自行擴充字根輸入法。
@frozen struct LMCassette {
struct LMCassette {
public private(set) var filePath: String?
public private(set) var nameShort: String = ""
public private(set) var nameENG: String = ""
Expand Down Expand Up @@ -45,7 +45,7 @@ public extension vChewingLM {
}
}

public extension vChewingLM.LMCassette {
extension LMAssembly.LMCassette {
/// 計算頻率時要用到的東西 - fscale
private static let fscale = 2.7
/// 萬用花牌字符,哪怕花牌鍵仍不可用。
Expand Down Expand Up @@ -86,7 +86,7 @@ public extension vChewingLM.LMCassette {
if FileManager.default.fileExists(atPath: path) {
do {
guard let fileHandle = FileHandle(forReadingAtPath: path) else {
throw vChewingLM.FileErrors.fileHandleError("")
throw LMAssembly.FileErrors.fileHandleError("")
}
let lineReader = try LineReader(file: fileHandle)
var theMaxKeyLength = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ import Megrez
import PinyinPhonaConverter
import Shared

public extension vChewingLM {
extension LMAssembly {
/// 與之前的 LMCore 不同,LMCoreEX 不在辭典內記錄實體,而是記錄 range 範圍。
/// 需要資料的時候,直接拿 range 去 strData 取資料。
/// 資料記錄原理與上游 C++ 的 ParselessLM 差不多,但用的是 Swift 原生手段。
/// 主要時間消耗仍在 For 迴圈,但這個算法可以顯著減少記憶體佔用。
@frozen struct LMCoreEX {
struct LMCoreEX {
public private(set) var filePath: String?
/// 資料庫辭典。索引內容為注音字串,資料內容則為字串首尾範圍、方便自 strData 取資料。
var rangeMap: [String: [Range<String.Index>]] = [:]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import Foundation
import Shared

public extension vChewingLM {
public extension LMAssembly {
@frozen struct LMPlainBopomofo {
public private(set) var filePath: String?
var dataMap: [String: String] = [:]
Expand All @@ -29,13 +29,8 @@ public extension vChewingLM {

do {
let rawData = try Data(contentsOf: URL(fileURLWithPath: path))
if let rawJSON = try? JSONSerialization.jsonObject(with: rawData) as? [String: String] {
dataMap = rawJSON
} else {
filePath = oldPath
vCLog("↑ Exception happened when reading JSON file at: \(path).")
return false
}
let rawJSON = try JSONDecoder().decode([String: String].self, from: rawData)
dataMap = rawJSON
} catch {
filePath = oldPath
vCLog("\(error)")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

import Shared

public extension vChewingLM {
@frozen struct LMReplacements {
extension LMAssembly {
struct LMReplacements {
public private(set) var filePath: String?
var rangeMap: [String: Range<String.Index>] = [:]
var strData: String = ""
Expand Down
Loading

0 comments on commit e44843e

Please sign in to comment.