From 253d927fd72f60e46704249dad49e3f3a442aebb Mon Sep 17 00:00:00 2001 From: Colin Leong <--unset> Date: Thu, 20 Jun 2024 12:36:38 -0400 Subject: [PATCH 1/5] CDL: add deprecated status to ATIS --- src/datasets/ATIS.json | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/ATIS.json b/src/datasets/ATIS.json index eb871d6..d098dc6 100644 --- a/src/datasets/ATIS.json +++ b/src/datasets/ATIS.json @@ -10,6 +10,7 @@ "#items": 292, "#samples": "595 Sentences ", "#signers": null, + "status": "deprecated", "license": null, "licenseUrl": null } From 06d4c85f87aed3010f5ff8cb290ac7d0baeb7461 Mon Sep 17 00:00:00 2001 From: Colin Leong <--unset> Date: Thu, 20 Jun 2024 12:37:36 -0400 Subject: [PATCH 2/5] CDL: add feature to check for deprecated datasets and skip them. Also comments on datasets.js --- src/datasets.js | 119 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 20 deletions(-) diff --git a/src/datasets.js b/src/datasets.js index 4fc0ef2..63be65f 100644 --- a/src/datasets.js +++ b/src/datasets.js @@ -1,27 +1,44 @@ +// CDL: added comments via discussion with ChatGPT 4o: https://chatgpt.com/share/3acd13d8-ddf8-4b71-95af-b7904f806b39 +// then manually spot-checked the ones I wasn't sure about. +// "*" means relevant docs at the end + + +// Import the NodeJS "file system" module * const fs = require('fs'); -function link(title, href) { - let s = title; +// Function to create a markdown link +function createMarkdownLink(title, href) { + let s = title; // Initialize link text with title + // If href is provided, format the string as a markdown link if (href) { s = `[${s}](${href})`; } - return s; + return s; // Return the formatted link or title } +// Function to sanitize text * function sanitize(text) { + // CDL: return unchanged if falsy. Later, falsy values are replaced with "" if (!text) { return text; } - if(typeof text === 'number') { + // If text is a number, convert it to a string + if (typeof text === 'number') { return String(text); } - return text.replace(/>/, "\\>") + // Replace '>' with escaped version + return text.replace(/>/, "\\>"); } +// Function to get an icon for a feature function getIcon(feature) { + // Split the feature into type and specificity + // CDL: this means that things like pose:OpenPose and pose:MediaPipe get the same icon. const [type, specificity] = feature.split(":"); + + // Dictionary mapping feature types to emoji const dict = { 'video': '🎥', 'pose': '👋', @@ -31,47 +48,109 @@ function getIcon(feature) { 'text': '📜', 'speech': '🔊', }; + + // Return an HTML span element with the appropriate emoji return `${dict[type]}` || "TODO"; + // Alternative return statement for using image icons // return `![${type}](assets/icons/${type}.png "${feature}")`; } +// Function to print a table row function printRow(row) { - console.log('|', row.join(' | '), '|'); + console.log('|', row.join(' | '), '|'); // Join row elements with ' | ' and print } - +// Define the path to the datasets directory const PATH = "src/datasets/"; -const datasets = fs.readdirSync(PATH) - .map(fName => String(fs.readFileSync(PATH + fName))) - .map(d => JSON.parse(d)) - .sort((a, b) => a.pub.name.toLowerCase() > b.pub.name.toLowerCase() ? 1 : -1); - +// Read the datasets directory and process each file * +// Colin: => means "Arrow function"* +const datasets = fs.readdirSync(PATH) // Read all filenames in the directory * + .map(fName => String(fs.readFileSync(PATH + fName))) // Read each file's content and convert to string * + .map(d => JSON.parse(d)) // Parse the JSON content. * + .sort((a, b) => a.pub.name.toLowerCase() > b.pub.name.toLowerCase() ? 1 : -1); // Sort datasets by publication name * +// Define column headers and their lengths for the table const columns = ['Dataset', 'Publication', 'Language', 'Features', '#Signs', '#Samples', '#Signers', 'License']; -const lengths = [4, 7, 3, 2, 2, 5, 2, 5] -// console.log('') +const lengths = [4, 7, 3, 2, 2, 5, 2, 5]; + +// Print the header row printRow(columns); // Header row -console.log('|' + lengths.map((l) => new Array(l).fill('-').join('')).join(' | ') + '|'); // Divider row +// Print the divider row with dashes +console.log('|' + lengths.map((l) => new Array(l).fill('-').join('')).join(' | ') + '|'); +// Define an emoji for download link const downloadEmoji = '💾'; +// Iterate over each dataset to print its details for (const dataset of datasets) { - let title = link(dataset.pub.name, dataset.pub.url); + // CDL: should we even include it? + if(dataset.status === "deprecated"){ + continue; //skip to the next one + } + + // Create the title link for the dataset + let title = createMarkdownLink(dataset.pub.name, dataset.pub.url); + + // If the dataset has a loader, add a download link if (dataset.loader) { const sld = 'https://github.com/sign-language-processing/datasets/tree/master/sign_language_datasets/datasets/' + dataset.loader; - title += ' ' + link(downloadEmoji, sld); + title += ' ' + createMarkdownLink(downloadEmoji, sld); } + + // Create a row with the dataset details + // CDL: falsy (empty, null, etc) values just replaced with blank strings const row = [ title, - dataset.pub.publication ? `@${dataset.pub.publication}` : dataset.pub.year || "", + dataset.pub.publication ? `@${dataset.pub.publication}` : dataset.pub.year || "", // add citation syntax. Make/Pandoc later replace with citation dataset.language, dataset["features"].length ? dataset["features"].map(getIcon).join("") : "TODO", - dataset["#items"] ? dataset["#items"].toLocaleString('en-US') : "", + dataset["#items"] ? dataset["#items"].toLocaleString('en-US') : "", // if there is an items field, format to standard sanitize(dataset["#samples"]) || "", dataset["#signers"] || "", - link(dataset.license, dataset.licenseUrl) + createMarkdownLink(dataset.license, dataset.licenseUrl) ]; + + // Print the dataset row printRow(row); } + +// JavaScript notes for non-JS programmers + +// Require: similar to "include" or "import" +// https://www.freecodecamp.org/news/requiring-modules-in-node-js-everything-you-need-to-know-e7fbd119be8/ + +// Falsy: Includes text with null value, empty strings, etc. +// https://www.freecodecamp.org/news/falsy-values-in-javascript/ +// https://developer.mozilla.org/en-US/docs/Glossary/Falsy + +// File system methods +// https://www.geeksforgeeks.org/node-js-fs-readdirsync-method/ +// https://www.geeksforgeeks.org/node-js-fs-readfilesync-method/ + +// Locale String: helps you reformat to a standard format. +// e.g. 1234 -> 1,234 +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Number/toLocaleString + + +// Sorting an array of strings +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/sort + +// JSON +// apparently in JavaScript, support for JavaScript Object Notations is built-in. Neat! +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/parse + +// JS ternary operator ? +// Basically an if/else statement. +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Conditional_operator + +// Arrow functions => +// kinda like a lambda function. For when you want to make a function but NOT name it/keep it around for later +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Functions/Arrow_functions + +// map: +// Used above to run the same (anonymous) function on everything in the array +// "The map() method of Array instances creates a new array populated with the results of calling a provided function on every element in the calling array." +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map \ No newline at end of file From 2500d0662374707a297d032edc9f26c406f543fa Mon Sep 17 00:00:00 2001 From: Colin Leong <122366389+cleong110@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:56:34 -0400 Subject: [PATCH 3/5] CDL: comments on datasets.js v2 --- src/datasets.js | 88 +++++++++---------------------------------------- 1 file changed, 15 insertions(+), 73 deletions(-) diff --git a/src/datasets.js b/src/datasets.js index 63be65f..ebb9d49 100644 --- a/src/datasets.js +++ b/src/datasets.js @@ -1,24 +1,17 @@ -// CDL: added comments via discussion with ChatGPT 4o: https://chatgpt.com/share/3acd13d8-ddf8-4b71-95af-b7904f806b39 -// then manually spot-checked the ones I wasn't sure about. -// "*" means relevant docs at the end - - -// Import the NodeJS "file system" module * const fs = require('fs'); -// Function to create a markdown link +// If href is provided, format the string as a markdown link function createMarkdownLink(title, href) { - let s = title; // Initialize link text with title + let s = title; - // If href is provided, format the string as a markdown link + if (href) { s = `[${s}](${href})`; } - return s; // Return the formatted link or title + return s; } -// Function to sanitize text * function sanitize(text) { // CDL: return unchanged if falsy. Later, falsy values are replaced with "" if (!text) { @@ -32,7 +25,7 @@ function sanitize(text) { return text.replace(/>/, "\\>"); } -// Function to get an icon for a feature +// Colin: gets the proper emoji icon for dataset features. function getIcon(feature) { // Split the feature into type and specificity // CDL: this means that things like pose:OpenPose and pose:MediaPipe get the same icon. @@ -49,108 +42,57 @@ function getIcon(feature) { 'speech': '🔊', }; - // Return an HTML span element with the appropriate emoji return `${dict[type]}` || "TODO"; - // Alternative return statement for using image icons // return `![${type}](assets/icons/${type}.png "${feature}")`; } -// Function to print a table row function printRow(row) { - console.log('|', row.join(' | '), '|'); // Join row elements with ' | ' and print + console.log('|', row.join(' | '), '|'); } -// Define the path to the datasets directory const PATH = "src/datasets/"; -// Read the datasets directory and process each file * -// Colin: => means "Arrow function"* const datasets = fs.readdirSync(PATH) // Read all filenames in the directory * .map(fName => String(fs.readFileSync(PATH + fName))) // Read each file's content and convert to string * .map(d => JSON.parse(d)) // Parse the JSON content. * .sort((a, b) => a.pub.name.toLowerCase() > b.pub.name.toLowerCase() ? 1 : -1); // Sort datasets by publication name * -// Define column headers and their lengths for the table const columns = ['Dataset', 'Publication', 'Language', 'Features', '#Signs', '#Samples', '#Signers', 'License']; const lengths = [4, 7, 3, 2, 2, 5, 2, 5]; -// Print the header row -printRow(columns); // Header row -// Print the divider row with dashes + +printRow(columns); + console.log('|' + lengths.map((l) => new Array(l).fill('-').join('')).join(' | ') + '|'); -// Define an emoji for download link const downloadEmoji = '💾'; -// Iterate over each dataset to print its details for (const dataset of datasets) { - // CDL: should we even include it? + if(dataset.status === "deprecated"){ continue; //skip to the next one } - - // Create the title link for the dataset + let title = createMarkdownLink(dataset.pub.name, dataset.pub.url); - // If the dataset has a loader, add a download link if (dataset.loader) { const sld = 'https://github.com/sign-language-processing/datasets/tree/master/sign_language_datasets/datasets/' + dataset.loader; title += ' ' + createMarkdownLink(downloadEmoji, sld); } - // Create a row with the dataset details - // CDL: falsy (empty, null, etc) values just replaced with blank strings + // CDL: note - falsy (empty, null, etc) values just replaced with blank strings const row = [ title, - dataset.pub.publication ? `@${dataset.pub.publication}` : dataset.pub.year || "", // add citation syntax. Make/Pandoc later replace with citation + dataset.pub.publication ? `@${dataset.pub.publication}` : dataset.pub.year || "", // add citation syntax @citationkey. Make/Pandoc later replace with citation dataset.language, dataset["features"].length ? dataset["features"].map(getIcon).join("") : "TODO", - dataset["#items"] ? dataset["#items"].toLocaleString('en-US') : "", // if there is an items field, format to standard + dataset["#items"] ? dataset["#items"].toLocaleString('en-US') : "", sanitize(dataset["#samples"]) || "", dataset["#signers"] || "", createMarkdownLink(dataset.license, dataset.licenseUrl) ]; - // Print the dataset row + printRow(row); } - -// JavaScript notes for non-JS programmers - -// Require: similar to "include" or "import" -// https://www.freecodecamp.org/news/requiring-modules-in-node-js-everything-you-need-to-know-e7fbd119be8/ - -// Falsy: Includes text with null value, empty strings, etc. -// https://www.freecodecamp.org/news/falsy-values-in-javascript/ -// https://developer.mozilla.org/en-US/docs/Glossary/Falsy - -// File system methods -// https://www.geeksforgeeks.org/node-js-fs-readdirsync-method/ -// https://www.geeksforgeeks.org/node-js-fs-readfilesync-method/ - -// Locale String: helps you reformat to a standard format. -// e.g. 1234 -> 1,234 -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Number/toLocaleString - - -// Sorting an array of strings -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/sort - -// JSON -// apparently in JavaScript, support for JavaScript Object Notations is built-in. Neat! -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/parse - -// JS ternary operator ? -// Basically an if/else statement. -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Conditional_operator - -// Arrow functions => -// kinda like a lambda function. For when you want to make a function but NOT name it/keep it around for later -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Functions/Arrow_functions - -// map: -// Used above to run the same (anonymous) function on everything in the array -// "The map() method of Array instances creates a new array populated with the results of calling a provided function on every element in the calling array." -// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map \ No newline at end of file From a3f37f4c45392ba8a61573069bd3ce4588f80304 Mon Sep 17 00:00:00 2001 From: Colin Leong <122366389+cleong110@users.noreply.github.com> Date: Fri, 21 Jun 2024 09:39:14 -0400 Subject: [PATCH 4/5] CDL: more commenting improvements! Thanks, Amit. --- src/datasets.js | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/datasets.js b/src/datasets.js index ebb9d49..bc7fe92 100644 --- a/src/datasets.js +++ b/src/datasets.js @@ -2,22 +2,21 @@ const fs = require('fs'); // If href is provided, format the string as a markdown link function createMarkdownLink(title, href) { - let s = title; + let s = title; + - if (href) { s = `[${s}](${href})`; } - return s; + return s; } function sanitize(text) { - // CDL: return unchanged if falsy. Later, falsy values are replaced with "" if (!text) { return text; } - // If text is a number, convert it to a string + if (typeof text === 'number') { return String(text); } @@ -30,9 +29,8 @@ function getIcon(feature) { // Split the feature into type and specificity // CDL: this means that things like pose:OpenPose and pose:MediaPipe get the same icon. const [type, specificity] = feature.split(":"); - - // Dictionary mapping feature types to emoji - const dict = { + + const featureEmojiDict = { 'video': '🎥', 'pose': '👋', 'mouthing': '👄', @@ -41,13 +39,13 @@ function getIcon(feature) { 'text': '📜', 'speech': '🔊', }; - - return `${dict[type]}` || "TODO"; + + return `${featureEmojiDict[type]}` || "TODO"; // return `![${type}](assets/icons/${type}.png "${feature}")`; } function printRow(row) { - console.log('|', row.join(' | '), '|'); + console.log('|', row.join(' | '), '|'); } const PATH = "src/datasets/"; @@ -60,39 +58,39 @@ const datasets = fs.readdirSync(PATH) // Read all filenames in the directory * const columns = ['Dataset', 'Publication', 'Language', 'Features', '#Signs', '#Samples', '#Signers', 'License']; const lengths = [4, 7, 3, 2, 2, 5, 2, 5]; +printRow(columns); -printRow(columns); - +// divider row console.log('|' + lengths.map((l) => new Array(l).fill('-').join('')).join(' | ') + '|'); const downloadEmoji = '💾'; for (const dataset of datasets) { - if(dataset.status === "deprecated"){ + if (dataset.status === "deprecated") { continue; //skip to the next one } - + let title = createMarkdownLink(dataset.pub.name, dataset.pub.url); - + if (dataset.loader) { const sld = 'https://github.com/sign-language-processing/datasets/tree/master/sign_language_datasets/datasets/' + dataset.loader; title += ' ' + createMarkdownLink(downloadEmoji, sld); } - // CDL: note - falsy (empty, null, etc) values just replaced with blank strings + // note - falsy (empty, null, etc) values just replaced with blank strings const row = [ title, dataset.pub.publication ? `@${dataset.pub.publication}` : dataset.pub.year || "", // add citation syntax @citationkey. Make/Pandoc later replace with citation dataset.language, dataset["features"].length ? dataset["features"].map(getIcon).join("") : "TODO", - dataset["#items"] ? dataset["#items"].toLocaleString('en-US') : "", + dataset["#items"] ? dataset["#items"].toLocaleString('en-US') : "", sanitize(dataset["#samples"]) || "", dataset["#signers"] || "", createMarkdownLink(dataset.license, dataset.licenseUrl) ]; - - + + printRow(row); } From b563b41375e9ed9654a9be67a237e76d5a2cbd60 Mon Sep 17 00:00:00 2001 From: Colin Leong <122366389+cleong110@users.noreply.github.com> Date: Mon, 24 Jun 2024 11:51:01 -0400 Subject: [PATCH 5/5] CDL: fix a few more comments in datasets.js --- src/datasets.js | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/datasets.js b/src/datasets.js index bc7fe92..8b800b7 100644 --- a/src/datasets.js +++ b/src/datasets.js @@ -24,10 +24,9 @@ function sanitize(text) { return text.replace(/>/, "\\>"); } -// Colin: gets the proper emoji icon for dataset features. function getIcon(feature) { - // Split the feature into type and specificity - // CDL: this means that things like pose:OpenPose and pose:MediaPipe get the same icon. + // Split the feature (e.g. "pose:OpenPose") into type and specificity ("pose" and "OpenPose") + // allows various specific features with the same type (pose:OpenPose and pose:MediaPipe) to get the same icon. const [type, specificity] = feature.split(":"); const featureEmojiDict = { @@ -68,7 +67,7 @@ const downloadEmoji = '💾'; for (const dataset of datasets) { if (dataset.status === "deprecated") { - continue; //skip to the next one + continue; } let title = createMarkdownLink(dataset.pub.name, dataset.pub.url);