ChakshuGautam · techsavvyash · Mar 9, 2024 · Jan 15, 2024 · Jan 15, 2024 · Jan 15, 2024
diff --git a/.github/workflows/db-tests.yml b/.github/workflows/db-tests.yml
@@ -13,7 +13,7 @@ jobs:
 
     strategy:
       matrix:
-        node-version: [14.x, 16.x, 18.x]
+        node-version: [16.x, 18.x]
 
     steps:
     - uses: actions/checkout@v3
@@ -22,4 +22,5 @@ jobs:
       with:
         node-version: ${{ matrix.node-version }}
     - run: npm i
+    - run: npm run generate
     - run: npm test
diff --git a/.gitignore b/.gitignore
@@ -130,4 +130,6 @@ dist
 .pnp.*
 
 
-**/.DS_Store
+**/.DS_Store
+
+student_large.csv
diff --git a/docs/tutorials/21.md b/docs/tutorials/21.md
@@ -0,0 +1,173 @@
+## Step 21: Add Approximate Counting using `HyperLogLog`
+
+### 21.1 First things first, create a utility to generate large files
+
+Since the effect of `HyperLogLog` is best seen on large files, we need to create a utility function which generates large files, let's say with `10_000_000` data points. To do this create a file named `generateLargeFile.js` in a `utils` folder and add the following logic to it.
+
+```js
+const fs = require('fs');
+const { faker, da } = require('@faker-js/faker');
+const { parse } = require('json2csv');
+
+async function generateLargeCSV(filename) {
+    let data = [];
+    for (let i = 1; i <= 10_000_000; i++) {
+        const record = {
+            id: i,
+            name: faker.person.firstName(),
+            age: faker.number.int({ min: 18, max: 100 }),
+        };
+        data.push(record);
+
+        let rows;
+        if (i % 500_000 === 0) {
+            console.log(`Generated ${i} records`);
+            if (!fs.existsSync(filename)) {
+                rows = parse(data, { header: true });
+            } else {
+                // Rows without headers.
+                rows = parse(data, { header: false });
+            }
+            fs.appendFileSync(filename, rows);
+            data = [];
+        }
+
+    }
+    // Append file function can create new file too.
+
+    // Always add new line if file already exists.
+    fs.appendFileSync(filename, "\r\n");
+}
+
+generateLargeCSV('student_large.csv')
+```
+
+### 21.2 Implement CSV reader for `HyperLogLog`
+
+Since `HyperLogLog` is a data structure which keeps data stored in a hashed format, we implement a separete CSV reader for it. Create a function named `readCSVforHLL` in your `csvStorage.js`.
+Sample logic for it can be found here:
+```js
+function readCSVForHLL(filePath, bitSampleSize = 12, digestSize = 128) {
+    const results = [];
+    var h = hll({ bitSampleSize: bitSampleSize, digestSize: digestSize });
+
+    return new Promise((resolve, reject) => {
+        fs.createReadStream(filePath)
+            .pipe(csv())
+            .on('data', (data) => h.insert(JSON.stringify(data)))
+            .on('end', () => {
+                resolve(h);
+            })
+            .on('error', (error) => {
+                reject(error);
+            });
+    });
+}
+```
+
+### 21.3 Update the `queryParser` implementation to identify `COUNT` and `APPROXIMATE_COUNT` as valid tokens
+
+Since our SQL queries will now be accepting the `COUNT` and `APPROXIMATE_COUNT` tokens as valid tokens, we need to update the logic of our parser to identify and process them accordingly. Update your `queryParser.js` with the required logic (regex) to identify that.
+
+
+### 21.4 Update the `executeSELECTQuery` function to add support for `COUNT` and `APPROXIMATE_COUNT` 
+
+Update the existing logic in the `executeSELECTQuery` function to identify and process the `COUNT` and `APPROXIMATE_COUNT` commands in your SQL query. 
+Some snippets that might be helpful are:
+```js
+// getting approx counts
+if (isApproximateCount && fields.length === 1 && fields[0] === 'COUNT(*)' && whereClauses.length === 0) {
+  let hll = await readCSVForHLL(`${table}.csv`);
+  return [{ 'APPROXIMATE_COUNT(*)': hll.estimate() }];
+}
+
+ // Distinct inside count - example "SELECT COUNT (DISTINCT student.name) FROM student"
+            if (isCountDistinct) {
+
+                if (isApproximateCount) {
+                    var h = hll({ bitSampleSize: 12, digestSize: 128 });
+                    orderedResults.forEach(row => h.insert(distinctFields.map(field => row[field]).join('|')));
+                    return [{ [`APPROXIMATE_${fields[0]}`]: h.estimate() }];
+                }
+                else {
+                    let distinctResults = [...new Map(orderedResults.map(item => [distinctFields.map(field => item[field]).join('|'), item])).values()];
+                    return [{ [fields[0]]: distinctResults.length }];
+                }
+            }
+```
+
+
+### 21.5 Write a test case for approximate count
+
+Since we are following `TDD` in this tutorial, we are going to be writing a test case to test our implementation now.
+Create a file named `approximateLargeFile.test.js` in your `tests` folder and add the following test cases:
+
+```js
+const fs = require('fs');
+const { executeSELECTQuery } = require('../src/queryExecuter');
+const jestConsole = console;
+
+beforeEach(() => {
+    global.console = require('console');
+});
+
+afterEach(() => {
+    global.console = jestConsole;
+});
+
+test('Large File Count(*) - Approximate and Exact', async () => {
+    // Test Exact Count
+
+    const startMemoryUsageExact = process.memoryUsage().heapUsed;
+    const startTimeExact = performance.now();
+
+    const queryExact = "SELECT COUNT(*) FROM student_large";
+    const resultExact = await executeSELECTQuery(queryExact);
+    const exactResult = resultExact[0]['COUNT(*)'];
+
+    const endTimeExact = performance.now();
+    const endMemoryUsageExact = process.memoryUsage().heapUsed;
+
+    console.log(`Execution Time for Exact Count: ${(endTimeExact - startTimeExact).toFixed(2)} ms`);
+    console.log(`Start Memory for Exact Count: ${startMemoryUsageExact / 1024 / 1024} MB`);
+    console.log(`End Memory for Exact Count: ${endMemoryUsageExact / 1024 / 1024} MB`);
+    console.log(`Memory Used for Exact Count: ${(endMemoryUsageExact - startMemoryUsageExact) / 1024 / 1024} MB`);
+
+    const startMemoryUsage = process.memoryUsage().heapUsed;
+    const startTime = performance.now();
+
+    const query = "SELECT APPROXIMATE_COUNT(*) FROM student_large";
+    const result = await executeSELECTQuery(query);
+
+    // Expect the approximate count to be within 5% of the actual count
+    expect(result[0]['APPROXIMATE_COUNT(*)']).toBeGreaterThan(exactResult - 0.05 * exactResult);
+    expect(result[0]['APPROXIMATE_COUNT(*)']).toBeLessThan(exactResult + 0.05 * exactResult);
+
+    const endTime = performance.now();
+    const endMemoryUsage = process.memoryUsage().heapUsed;
+
+    console.log(`Execution Time for Approximate Count: ${(endTime - startTime).toFixed(2)} ms`);
+    console.log(`Start Memory: ${startMemoryUsage / 1024 / 1024} MB`);
+    console.log(`End Memory: ${endMemoryUsage / 1024 / 1024} MB`);
+    console.log(`Memory Used for Approximate Count: ${(endMemoryUsage - startMemoryUsage) / 1024 / 1024} MB`);
+
+}, 120000);
+
+test('Execute SQL Query with COUNT with DISTINCT on a column', async () => {
+    const queryExact = "SELECT COUNT(DISTINCT (name, age)) FROM student_large";
+    const resultExact = await executeSELECTQuery(queryExact);
+    console.log({ resultExact });
+    const exactResult = resultExact[0]['COUNT(DISTINCT (name, age))'];
+
+    const query = "SELECT APPROXIMATE_COUNT(DISTINCT (name, age)) FROM student_large";
+    const result = await executeSELECTQuery(query);
+
+    // Expect the approximate count to be within 2% of the actual count
+    expect(result[0]['APPROXIMATE_COUNT(DISTINCT (name, age))']).toBeGreaterThan(exactResult - 0.05 * exactResult);
+    expect(result[0]['APPROXIMATE_COUNT(DISTINCT (name, age))']).toBeLessThan(exactResult + 0.05 * exactResult);
+}, 120000);
+```
+
+### 21.6 Update the tests for other files to test for the updates you made in other parts of the implementation
+
+Since we have made changes to the other parts of the implementation such as the `csvStorage.js`, `queryParser.js` and `queryExecutor.js` we need to update the tests for those files to test for the functionality.