Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEATURE/xml_to_json #7

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions collector/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@
"dependencies": {
"@aws-sdk/client-cloudfront": "^3.352.0",
"@aws-sdk/client-s3": "^3.352.0",
"@types/xml2js": "^0.4.11",
"@xmldom/xmldom": "^0.8.8",
"ansi-colors": "^4.1.3",
"cldr-localenames-full": "^43.1.0",
"csv-parse": "^5.4.0",
"html-minifier-terser": "^7.2.0",
"lodash-es": "^4.17.21",
"node-stream-zip": "^1.15.0",
"xml2js": "^0.6.0",
"xpath": "^0.0.32",
"xslt3": "^2.5.0",
"yaml": "^2.3.1",
Expand Down
9 changes: 8 additions & 1 deletion collector/src/parts/content.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {extract_meta} from './usx.js'
import {update_manifest} from './manifest.js'
import {concurrent, PKG_PATH, read_json, read_dir} from './utils.js'
import type {TranslationSourceMeta, BookExtracts} from './types'
import { convert_to_json } from './xml_to_json.js'


export async function update_source(trans_id?:string){
Expand Down Expand Up @@ -169,7 +170,7 @@ async function _update_dist_single(id:string){
}

// Ensure dist dirs exist
for (const format of ['usx', 'usfm', 'html', 'txt']){
for (const format of ['usx', 'usfm', 'html', 'txt', 'json']){
fs.mkdirSync(join(dist_dir, format), {recursive: true})
}

Expand Down Expand Up @@ -207,6 +208,7 @@ async function _update_dist_single(id:string){
const src = join(usx_dir, `${book}.usx`)
const dst_html = join(dist_dir, 'html', `${book}.html`)
const dst_txt = join(dist_dir, 'txt', `${book}.txt`)
const dst_json = join(dist_dir, 'json', `${book}.json`)

// Convert to HTML if doesn't exist yet
if (!fs.existsSync(dst_html)){
Expand All @@ -225,5 +227,10 @@ async function _update_dist_single(id:string){
if (!fs.existsSync(dst_txt)){
execSync(`${xslt3} -xsl:${xsl_template_txt} -s:${src} -o:${dst_txt}`)
}

// Convert to json if doesn't exist yet
if (!fs.existsSync(dst_json)){
convert_to_json(fs.readFileSync(src, 'utf-8'), dst_json)
}
}
}
186 changes: 186 additions & 0 deletions collector/src/parts/xml_to_json.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
import { parseString } from "xml2js"
import { writeFileSync } from 'fs'

/** Shape of the JSON document written for a single book. */
interface IJsonFormat {
    /**
     * Verse texts grouped by chapter number (used as the object key).
     * Within a chapter, the text of verse `n` is normally stored at index `n - 1`.
     */
    chapters: Record<string, string[]>;
    /** Section headers, each tagged with the chapter/verse it precedes. */
    headers: Array<{
        chapter: number;
        verse: number;
        text: string;
    }>;
}

/** Working state for the verse that is currently being accumulated. */
interface ICurrentIndex {
    /** Chapter number of the verse in progress (0 before the first verse is seen). */
    chapter: number;
    /** Verse number of the verse in progress (0 before the first verse is seen). */
    verse: number;
    /** Text collected so far for the verse in progress. */
    text: string;
    /** Header text seen since the last verse started, waiting to be attached to the next verse. */
    header: string | undefined;
}

/** One `para` element as it appears in the parsed XML object tree. */
interface IRow {
    /** Text content of the paragraph. */
    _: string;
    /** Child `verse` elements, each carrying a start id (`sid`) and/or end id (`eid`) attribute. */
    verse?: Array<{
        $: {
            sid?: string;
            eid?: string;
        };
    }>;
    /** Attributes of the paragraph itself. */
    $: {
        style: string;
    };
}

/**
 * Build a fresh working index for the given chapter/verse.
 *
 * @param chapter - Chapter number of the verse being started.
 * @param verse - Verse number of the verse being started.
 * @param text - Initial verse text; an empty string is used when omitted.
 * @param header - Pending header text, if any.
 * @returns A new index object (never shared with a previous one).
 */
function resetIndex(chapter: number, verse: number, text?: string, header?: string): ICurrentIndex {
    const fresh: ICurrentIndex = { chapter, verse, text: "", header };
    if (text != null) {
        fresh.text = text;
    }
    return fresh;
}

/**
 * Convert one book's XML into the `IJsonFormat` layout and write it to disk.
 *
 * Paragraphs are walked in document order:
 *  - a paragraph containing a `verse` tag starts a new verse or continues the current one
 *  - style `s1` is remembered as a header for the next verse
 *  - style `li1` is appended to the current verse as a new line
 *  - style `b` adds a line break to the pending header if there is one, else to the verse text
 *  - every other paragraph without a verse tag is ignored
 *
 * @param xmlString - Full XML source of the book.
 * @param filePath - Destination of the JSON file; also used to label the error log line.
 *
 * If the XML cannot be parsed the error is logged and no file is written.
 */
export function convert_to_json(xmlString: string, filePath: string): void {
    type BookContext = { data: IJsonFormat; currentIndex: ICurrentIndex };

    // Drop `char` tags (their inner text stays in place) before handing the XML to the parser
    const trimmed = removeXmlElements(xmlString, ['char'])

    parseString(trimmed, function (err, result) {
        if (err) {
            console.log(`err ${filePath}`, err);
            // Stop here: `result` is not usable after a parse failure, and falling through
            // would only fail again while dereferencing it
            return;
        }
        const paragraphs: IRow[] = result.usx.para;

        let context: BookContext = {
            data: {
                chapters: {},
                headers: []
            },
            currentIndex: resetIndex(0, 0)
        };

        for (const row of paragraphs) {
            // Row with a verse tag: first or last row of a verse
            if (row.verse) {
                const nextIndex = getVerseIndex(row.verse, context.currentIndex)
                context = handleVerseRow(context, nextIndex, row)
                continue
            }

            switch (row.$.style) {
                case 's1':
                    // Header: held back until the next verse starts
                    context.currentIndex.header = getTextString(row)
                    break
                case 'li1':
                    // Another line within the same verse
                    context.currentIndex.text = `${context.currentIndex.text}\n${getTextString(row)}`
                    break
                case 'b':
                    // Line break: belongs to the pending header when there is one, else to the verse
                    if (context.currentIndex.header !== undefined) {
                        context.currentIndex.header = `${context.currentIndex.header}\n`
                    } else {
                        context.currentIndex.text = `${context.currentIndex.text}\n`
                    }
                    break
                default:
                    // Non-verse content, ignored
                    break
            }
        }

        // Flush the last verse. Chapter 0 is the initial state (no verse was ever started),
        // so skip it rather than emitting a phantom chapter "0" with one empty verse.
        if (context.currentIndex.chapter !== 0) {
            context = writeVerse(context)
        }

        writeFileSync(filePath, JSON.stringify(context.data))
    });
}

/**
 * Work out which chapter/verse a paragraph belongs to from its `verse` tags.
 *
 * The first tag carrying a non-empty `sid` or `eid` is used. The id is expected to look like
 * `"<BOOK> <chapter>:<verse>"`; only the leading integer of each part is read, so a bridged
 * or segmented verse id such as `"GEN 1:2-3"` or `"GEN 1:2a"` resolves to verse 2.
 *
 * @param verseTags - `verse` children of the paragraph.
 * @param currentIndex - Position currently being accumulated; used as the fallback.
 * @returns The parsed chapter/verse, or the current chapter/verse when no tag carries an id
 *          or the id cannot be parsed (so a malformed id never yields `NaN`).
 */
function getVerseIndex(verseTags: { $: { sid?: string; eid?: string; } }[], currentIndex: ICurrentIndex): { chapter: number; verse: number; } {
    const unchanged = {
        chapter: currentIndex.chapter,
        verse: currentIndex.verse
    }

    // `||` rather than `??` so an empty sid does not shadow a usable eid
    let verseId: string | undefined
    for (const tag of verseTags) {
        verseId = tag.$?.sid || tag.$?.eid
        if (verseId) {
            break
        }
    }
    if (!verseId) {
        return unchanged
    }

    const [chapterPart, versePart] = verseId.split(" ")[1]?.split(":") ?? []
    const chapter = Number.parseInt(chapterPart ?? "", 10)
    const verse = Number.parseInt(versePart ?? "", 10)
    if (Number.isNaN(chapter) || Number.isNaN(verse)) {
        return unchanged
    }

    return { chapter, verse }
}

/**
 * Return the text content of a parsed paragraph.
 *
 * The parser leaves `_` unset for elements that have no text of their own (for example a
 * paragraph holding only a `verse` tag), so fall back to an empty string. Callers interpolate
 * the result into template strings, where a missing value would show up as the literal
 * text "undefined".
 *
 * @param row - Parsed paragraph.
 * @returns The paragraph text, or `""` when it has none.
 */
function getTextString(row: IRow): string {
    return row._ ?? ""
}

/**
 * Strip the opening, closing and self-closing tags of the named elements from an XML string,
 * leaving whatever was between them in place.
 *
 * This is needed because of the way `char` elements are handled: removing the wrapper keeps
 * their text inline with the surrounding paragraph.
 *
 * @param hay - XML source to clean.
 * @param elements - Element names whose tags should be removed (matched literally).
 * @returns The XML string without those tags.
 */
const removeXmlElements = (hay: string, elements: string[]): string => {
    let ret = hay

    for (const item of elements) {
        // Escape regex metacharacters so the element name is always matched literally
        const name = item.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
        // One pattern covers <name>, <name attr="…"> (any whitespace after the name,
        // including a line break), <name/> and </name>; requiring whitespace or the tag end
        // right after the name keeps longer names such as <names> untouched
        const tag = new RegExp(`</?${name}(?:\\s[^<>]*)?/?>`, "g")
        ret = ret.replace(tag, "")
    }

    return ret
}

/**
 * Pad an array with a filler value until it reaches the requested length.
 *
 * @param arr - Source array; never modified.
 * @param endIndex - Target length.
 * @param element - Value used for every added slot.
 * @returns `arr` itself when it is already at least `endIndex` long, otherwise a padded copy.
 */
function addElementsToArray<T>(arr: T[], endIndex: number, element: T): T[] {
    // Already long enough: hand back the very same array
    if (endIndex <= arr.length) {
        return arr;
    }

    const padded = Array.from(arr);
    while (padded.length < endIndex) {
        padded.push(element);
    }
    return padded;
}

/**
 * Store the verse currently being accumulated into its chapter.
 *
 * The chapter's verse list is first padded with empty strings up to `verse - 1` entries, then
 * the accumulated text is appended. `data.chapters` is replaced with an updated object.
 *
 * @param currentState - Book data plus the working index; updated in place.
 * @returns The same `currentState` object.
 */
function writeVerse(currentState: { data: IJsonFormat; currentIndex: ICurrentIndex }): { data: IJsonFormat; currentIndex: ICurrentIndex } {
    const { chapter, verse, text } = currentState.currentIndex;

    const existing = currentState.data.chapters[chapter] ?? [];

    // Gaps before this verse are filled with empty strings
    const padded = addElementsToArray(existing, verse - 1, "");

    currentState.data.chapters = Object.assign({}, currentState.data.chapters, {
        [chapter]: padded.concat(text)
    });

    return currentState;
}

/**
 * Process a paragraph that carries a verse tag.
 *
 * If the paragraph still belongs to the verse in progress, its text is appended on a new line.
 * Otherwise the verse in progress is written out (unless we are still in the initial state),
 * any pending header is recorded against the new verse, and the working index is restarted
 * with the paragraph's text.
 *
 * @param currentState - Book data plus the working index; updated in place.
 * @param nextIndex - Chapter/verse the paragraph belongs to.
 * @param row - The parsed paragraph.
 * @returns The updated state.
 */
function handleVerseRow(currentState: { data: IJsonFormat; currentIndex: ICurrentIndex }, nextIndex: { chapter: number; verse: number; }, row: IRow): { data: IJsonFormat; currentIndex: ICurrentIndex } {
    const active = currentState.currentIndex;
    // Chapter 0 marks the initial state, where there is nothing to continue or write yet
    const started = active.chapter !== 0;
    const sameVerse = nextIndex.chapter === active.chapter && nextIndex.verse === active.verse;

    if (started && sameVerse) {
        active.text = active.text + "\n" + getTextString(row);
        return currentState;
    }

    const state = started ? writeVerse(currentState) : currentState;

    // A header seen since the previous verse belongs to the verse that starts now
    const pendingHeader = state.currentIndex.header;
    if (pendingHeader) {
        state.data.headers.push({
            chapter: nextIndex.chapter,
            verse: nextIndex.verse,
            text: pendingHeader
        });
    }

    // The new verse becomes the context until the next one is reached
    state.currentIndex = resetIndex(nextIndex.chapter, nextIndex.verse, getTextString(row));
    return state;
}