Skip to content

Commit

Permalink
LMAssembly // Integrate EtenDOS SCPC data into the codebase.
Browse files Browse the repository at this point in the history
  • Loading branch information
ShikiSuen committed Feb 23, 2024
1 parent e44843e commit 3ebb5f2
Show file tree
Hide file tree
Showing 11 changed files with 1,396 additions and 89 deletions.
4 changes: 0 additions & 4 deletions Packages/vChewing_LangModelAssembly/Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@ let package = Package(
.product(name: "Megrez", package: "vChewing_Megrez"),
.product(name: "Shared", package: "vChewing_Shared"),
.product(name: "PinyinPhonaConverter", package: "vChewing_PinyinPhonaConverter"),
],
resources: [
.process("Resources/sequenceDataFromEtenDOS-chs.json"),
.process("Resources/sequenceDataFromEtenDOS-cht.json"),
]
),
.testTarget(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,9 @@ public extension LMAssembly {
lmUserOverride = .init(dataURL: uomDataURL)
}

public func setOptions(handler: (inout Config) -> Void) {
@discardableResult public func setOptions(handler: (inout Config) -> Void) -> LMInstantiator {
handler(&config)
return self
}

@discardableResult public static func connectSQLDB(dbPath: String, dropPreviousConnection: Bool = true) -> Bool {
Expand Down Expand Up @@ -97,6 +98,7 @@ public extension LMAssembly {

// 磁帶資料模組。「currentCassette」對外唯讀,僅用來讀取磁帶本身的中繼資料(Metadata)。
static var lmCassette = LMCassette()
static var lmPlainBopomofo = LMPlainBopomofo()

// 聲明使用者語言模組。
// 使用者語言模組使用多執行緒的話,可能會導致一些問題。有時間再仔細排查看看。
Expand All @@ -111,7 +113,6 @@ public extension LMAssembly {
)
var lmReplacements = LMReplacements()
var lmAssociates = LMAssociates()
var lmPlainBopomofo = LMPlainBopomofo()

// 半衰记忆模组
var lmUserOverride: LMUserOverride
Expand Down Expand Up @@ -189,23 +190,6 @@ public extension LMAssembly {
}
}

public func loadSCPCSequencesData() {
let fileName = !isCHS ? "sequenceDataFromEtenDOS-cht" : "sequenceDataFromEtenDOS-chs"
guard let path = Bundle.module.path(forResource: fileName, ofType: "json") else {
vCLog("lmPlainBopomofo: File name access failure: \(fileName)")
return
}
DispatchQueue.main.async {
if FileManager.default.isReadableFile(atPath: path) {
self.lmPlainBopomofo.clear()
self.lmPlainBopomofo.open(path)
vCLog("lmPlainBopomofo: \(self.lmPlainBopomofo.count) entries of data loaded from: \(path)")
} else {
vCLog("lmPlainBopomofo: File access failure: \(path)")
}
}
}

public var isCassetteDataLoaded: Bool { Self.lmCassette.isLoaded }
public static func loadCassetteData(path: String) {
DispatchQueue.main.async {
Expand Down Expand Up @@ -340,7 +324,9 @@ public extension LMAssembly {

// 如果有檢測到使用者自訂逐字選字語料庫內的相關資料的話,在這裡先插入。
if config.isSCPCEnabled {
rawAllUnigrams += lmPlainBopomofo.valuesFor(key: keyChain).map { Megrez.Unigram(value: $0, score: 0) }
rawAllUnigrams += Self.lmPlainBopomofo.valuesFor(key: keyChain, isCHS: isCHS).map {
Megrez.Unigram(value: $0, score: 0)
}
}

// 用 reversed 指令讓使用者語彙檔案內的詞條優先順序隨著行數增加而逐漸增高。
Expand Down

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -9,60 +9,35 @@
import Foundation
import Shared

public extension LMAssembly {
@frozen struct LMPlainBopomofo {
public private(set) var filePath: String?
var dataMap: [String: String] = [:]
extension LMAssembly {
struct LMPlainBopomofo {
@usableFromInline typealias DataMap = [String: [String: String]]
let dataMap: DataMap

public var count: Int { dataMap.count }

public init() {
dataMap = [:]
}

public var isLoaded: Bool { !dataMap.isEmpty }

@discardableResult public mutating func open(_ path: String) -> Bool {
if isLoaded { return false }
let oldPath = filePath
filePath = nil

do {
let rawData = try Data(contentsOf: URL(fileURLWithPath: path))
let rawJSON = try JSONDecoder().decode([String: String].self, from: rawData)
let rawData = jsnEtenDosSequence.data(using: .utf8) ?? .init([])
let rawJSON = try JSONDecoder().decode([String: [String: String]].self, from: rawData)
dataMap = rawJSON
} catch {
filePath = oldPath
vCLog("\(error)")
vCLog("↑ Exception happened when reading JSON file at: \(path).")
return false
vCLog("↑ Exception happened when parsing raw JSON sequence data from vChewing LMAssembly.")
dataMap = [:]
}

filePath = path
return true
}

public mutating func clear() {
filePath = nil
dataMap.removeAll()
}

public func saveData() {
guard let filePath = filePath, let plistURL = URL(string: filePath) else { return }
do {
let plistData = try PropertyListSerialization.data(fromPropertyList: dataMap, format: .binary, options: 0)
try plistData.write(to: plistURL)
} catch {
vCLog("Failed to save current database to: \(filePath)")
}
}
public var isLoaded: Bool { !dataMap.isEmpty }

public func valuesFor(key: String) -> [String] {
public func valuesFor(key: String, isCHS: Bool) -> [String] {
var pairs: [String] = []
if let arrRangeRecords: String = dataMap[key]?.trimmingCharacters(in: .newlines) {
let subKey = isCHS ? "S" : "T"
if let arrRangeRecords: String = dataMap[key]?[subKey] {
pairs.append(contentsOf: arrRangeRecords.map(\.description))
}
return pairs.deduplicated
// 這裡不做去重複處理,因為倚天中文系統注音排序適應者們已經形成了肌肉記憶。
return pairs
}

public func hasValuesFor(key: String) -> Bool { dataMap.keys.contains(key) }
Expand Down
Loading

0 comments on commit 3ebb5f2

Please sign in to comment.