feat: move string utility methods from onedrive-support

adobe · Dec 6, 2024 · ecc6e82 · ecc6e82
1 parent 1c979b8
commit ecc6e82
Show file tree

Hide file tree

Showing 2 changed files with 249 additions and 1 deletion.
diff --git a/packages/helix-shared-string/src/string.js b/packages/helix-shared-string/src/string.js
@@ -59,3 +59,113 @@ export function multiline(str) {
     .map((l) => l.slice(prefixLen)) // discard prefixes
     .join('\n');
 }
+
+/*
+ * Copyright 2020 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+
+/**
+ * Splits the given name at the last '.', returning the base name and the lower-cased extension.
+ * @param {string} name Filename
+ * @returns {string[]} `[baseName, ext]`; ext is '' if the last '.' is absent, leading or trailing.
+ */
+export function splitByExtension(name) {
+  const idx = name.lastIndexOf('.');
+  const baseName = idx > 0 && idx < name.length - 1 ? name.substring(0, idx) : name;
+  const ext = idx > 0 && idx < name.length - 1 ? name.substring(idx + 1).toLowerCase() : '';
+  return [baseName, ext];
+}
+
+/**
+ * Sanitizes the given string by:
+ * - converting it to lower case
+ * - decomposing unicode characters (NFD) and stripping combining marks (U+0300 to U+036F)
+ * - replacing every run of characters outside `a-z0-9` with a single dash
+ * - thereby collapsing consecutive dashes into one
+ * - removing a leading and a trailing dash
+ *
+ * @param {string} name string to sanitize
+ * @returns {string} sanitized name
+ */
+export function sanitizeName(name) {
+  return name
+    .toLowerCase()
+    .normalize('NFD')
+    .replace(/[\u0300-\u036f]/g, '') // drop the combining marks split off by NFD
+    .replace(/[^a-z0-9]+/g, '-')
+    .replace(/^-|-$/g, '');
+}
+
+/**
+ * Sanitizes the file path by:
+ * - converting it to lower case
+ * - decomposing unicode characters (NFD) and stripping combining marks
+ * - replacing every run of non-alphanumeric characters with a single dash
+ * - thereby collapsing consecutive dashes into one
+ * - removing a leading and a trailing dash
+ *
+ * Note that only the basename of the file path is sanitized, i.e. the ancestor path and the
+ * extension are not affected.
+ *
+ * @param {string} filepath the file path
+ * @param {object} [opts] Options
+ * @param {boolean} [opts.ignoreExtension] if `true`, the whole last segment is sanitized
+ * @returns {string} sanitized file path
+ */
+export function sanitizePath(filepath, opts = {}) {
+  const idx = filepath.lastIndexOf('/') + 1; // start of the last path segment (0 without '/')
+  const extIdx = opts.ignoreExtension ? -1 : filepath.lastIndexOf('.');
+  const pfx = filepath.substring(0, idx);
+  const basename = extIdx < idx ? filepath.substring(idx) : filepath.substring(idx, extIdx);
+  const ext = extIdx < idx ? '' : filepath.substring(extIdx); // keeps the leading '.'
+  const name = sanitizeName(basename);
+  return `${pfx}${name}${ext}`;
+}
+
+/**
+ * Compute the edit distance with an iterative dynamic-programming matrix. Since we only expect
+ * relatively short filenames, the algorithm shouldn't be too expensive.
+ *
+ * @param {string} s0 Input string
+ * @param {string} s1 Input string
+ * @returns {number} minimal number of single-character inserts, removals and replacements
+ */
+export function editDistance(s0, s1) {
+  // make sure that s0 is at least as long as s1
+  if (s0.length < s1.length) {
+    const t = s1;
+    // eslint-disable-next-line no-param-reassign
+    s1 = s0;
+    // eslint-disable-next-line no-param-reassign
+    s0 = t;
+  }
+  const l0 = s0.length;
+  const l1 = s1.length;
+
+  // init first row: turning an empty prefix of s0 into the first c chars of s1 costs c
+  const resultMatrix = [[]];
+  for (let c = 0; c < l1 + 1; c += 1) {
+    resultMatrix[0][c] = c;
+  }
+  // fill out the distance matrix row by row; the bottom-right cell holds the final distance
+  for (let i = 1; i < l0 + 1; i += 1) {
+    resultMatrix[i] = [i];
+    for (let j = 1; j < l1 + 1; j += 1) {
+      const replaceCost = (s0.charAt(i - 1) === s1.charAt(j - 1)) ? 0 : 1;
+      resultMatrix[i][j] = Math.min(
+        resultMatrix[i - 1][j] + 1, // insert
+        resultMatrix[i][j - 1] + 1, // remove
+        resultMatrix[i - 1][j - 1] + replaceCost, // replace (free when the chars match)
+      );
+    }
+  }
+  return resultMatrix[l0][l1];
+}
diff --git a/packages/helix-shared-string/test/string.test.js b/packages/helix-shared-string/test/string.test.js
@@ -13,7 +13,9 @@
 /* eslint-env mocha */
 
 import assert from 'assert';
-import { multiline } from '../src/string.js';
+import {
+  multiline, editDistance, sanitizeName, sanitizePath, splitByExtension,
+} from '../src/string.js';
 
 describe('String tests', () => {
   it('multiline()', () => {
@@ -39,3 +41,139 @@ describe('String tests', () => {
   `);
   });
 });
+
+describe('splitByExtension Tests', () => { // NOTE(review): asserts use (expected, actual) order
+  it('extension split works for empty string', () => {
+    assert.deepStrictEqual(['', ''], splitByExtension(''));
+  });
+
+  it('extension split works for string w/o extension', () => {
+    assert.deepStrictEqual(['foo', ''], splitByExtension('foo'));
+  });
+
+  it('extension split works for string with extension', () => {
+    assert.deepStrictEqual(['foo', 'txt'], splitByExtension('foo.txt'));
+  });
+
+  it('extension split works for string with dots and extension', () => {
+    assert.deepStrictEqual(['foo.bar', 'txt'], splitByExtension('foo.bar.txt'));
+  });
+
+  it('extension split works for string ending with a dot', () => {
+    assert.deepStrictEqual(['foo.', ''], splitByExtension('foo.'));
+  });
+
+  it('extension split works for string starting with a dot', () => {
+    assert.deepStrictEqual(['.foo', ''], splitByExtension('.foo'));
+  });
+});
+
+describe('sanitize Tests', () => { // exercises sanitizeName()
+  it('sanitize works for empty string', () => {
+    assert.strictEqual(sanitizeName(''), '');
+  });
+
+  it('sanitize transform string to lower case', () => {
+    assert.strictEqual(sanitizeName('MyDocument'), 'mydocument');
+  });
+
+  it('sanitize transforms non-alpha to dashes', () => {
+    assert.strictEqual(sanitizeName('My 2. Document'), 'my-2-document');
+  });
+
+  it('sanitize removes leading dashes', () => {
+    assert.strictEqual(sanitizeName('.My 2. Document'), 'my-2-document');
+  });
+
+  it('sanitize removes trailing dashes', () => {
+    assert.strictEqual(sanitizeName('.My 2. Document-'), 'my-2-document');
+  });
+
+  it('sanitize normalizes unicode', () => {
+    assert.strictEqual(sanitizeName('Föhren Smürd'), 'fohren-smurd');
+  });
+});
+
+describe('editDistance Tests', () => { // NOTE(review): asserts use (expected, actual) order
+  it('editDistances works for empty strings', () => {
+    assert.strictEqual(0, editDistance('', ''));
+  });
+
+  it('editDistances works for equal strings', () => {
+    assert.strictEqual(0, editDistance('foo', 'foo'));
+  });
+
+  it('editDistances works for appended characters', () => {
+    assert.strictEqual(3, editDistance('foo', 'foo123'));
+  });
+
+  it('editDistances works for removed characters from the end', () => {
+    assert.strictEqual(3, editDistance('foo123', 'foo'));
+  });
+
+  it('editDistances works for replaced characters', () => {
+    assert.strictEqual(3, editDistance('My Document', 'my-document'));
+  });
+
+  it('editDistances works for more complicate replacements', () => {
+    assert.strictEqual(5, editDistance('My 1. Document', 'my-1-document'));
+  });
+
+  it('editDistances works for more complicate replacements (2)', () => {
+    assert.strictEqual(10, editDistance('my-1-document', 'My 1. Document.docx'));
+  });
+
+  it('editDistances is reasonably fast for long names)', () => {
+    const t0 = Date.now();
+    assert.strictEqual(66, editDistance(
+      'my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document my-1-document ',
+      'My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document My 1. Document .docx',
+    ));
+    const t1 = Date.now();
+    assert.ok(t1 - t0 < 100); // NOTE(review): wall-clock bound in ms; may flake on slow CI
+  });
+});
+
+describe('sanitizePath Tests', () => {
+  it('sanitizePath works for empty string', () => {
+    assert.strictEqual(sanitizePath(''), '');
+  });
+
+  it('sanitizePath transform string to lower case', () => {
+    assert.strictEqual(sanitizePath('MyDocument'), 'mydocument');
+  });
+
+  it('sanitizePath can ignore extension', () => {
+    assert.strictEqual(sanitizePath('.MyDocument', {
+      ignoreExtension: true, // without it, '.MyDocument' is treated as the extension
+    }), 'mydocument');
+  });
+
+  it('sanitizePath works with dots in path and no extension', () => {
+    assert.strictEqual(sanitizePath('/foo.bar/My Document'), '/foo.bar/my-document');
+  });
+
+  it('sanitizePath only transforms last path segment', () => {
+    assert.strictEqual(sanitizePath('/Untitled Folder/MyDocument'), '/Untitled Folder/mydocument');
+  });
+
+  it('sanitizePath only transforms root segment', () => {
+    assert.strictEqual(sanitizePath('/MyDocument'), '/mydocument');
+  });
+
+  it('sanitizePath transforms non-alpha to dashes', () => {
+    assert.strictEqual(sanitizePath('My 2. Document.docx'), 'my-2-document.docx');
+  });
+
+  it('sanitizePath removes leading dashes', () => {
+    assert.strictEqual(sanitizePath('.My 2. Document.docx'), 'my-2-document.docx');
+  });
+
+  it('sanitizePath removes trailing dashes', () => {
+    assert.strictEqual(sanitizePath('.My 2. Document!.docx'), 'my-2-document.docx');
+  });
+
+  it('sanitizePath normalizes unicode', () => {
+    assert.strictEqual(sanitizePath('Föhren Smürd'), 'fohren-smurd');
+  });
+});