Skip to content

Commit

Permalink
compile quotes into topic clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
bramses committed Mar 24, 2024
1 parent 79c61f4 commit 41ca718
Show file tree
Hide file tree
Showing 6 changed files with 671 additions and 1 deletion.
Binary file modified .DS_Store
Binary file not shown.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ node_modules
data*.json
amazon_links.json
records*.json
books-notion-table.csv
books-notion-table.csv
kindle-highlights
kindle-export-supabase.js
Empty file added book-covers-db.js
Empty file.
183 changes: 183 additions & 0 deletions compile-quotes.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/*
compile all quotes from supabase where BookID = X.
Then use GPT to cluster them into a few categories.
Then create an easy-to-read story about the highlights.
*/

import { createClient } from "@supabase/supabase-js";
import dotenv from "dotenv";
import fs from "fs";
import similarity from "compute-cosine-similarity";
import { Configuration, OpenAIApi } from "openai";

// Load variables from a local .env file into process.env once, before any
// client below reads its credentials.
dotenv.config();

// OpenAI client, configured from OPENAI_API_KEY and OPENAI_ORG.
const configuration = new Configuration({
  apiKey: process.env.OPENAI_API_KEY,
  organization: process.env.OPENAI_ORG,
});

const openai = new OpenAIApi(configuration);

// Similarity cutoff that main() passes to clusterEmbeddings().
const CLUSTER_THRESHOLD = 0.8;
// book_id whose highlights main() compiles.
const BOOK_ID = "33431849";

// Supabase client, configured from SUPABASE_URL and SUPABASE_KEY.
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseKey = process.env.SUPABASE_KEY;
const supabase = createClient(supabaseUrl, supabaseKey, {
  auth: {
    // NOTE(review): presumably disabled because this is a one-shot script
    // with no session worth keeping between runs — confirm.
    persistSession: false,
  },
});

/**
 * Fetch every row of the "highlights" table whose book_id equals `bookId`.
 *
 * @param {*} bookId - Value matched against the book_id column.
 * @returns {Promise<Array>} The rows returned by the query, or an empty array
 *   when the query reports an error (the error is logged, not thrown).
 */
const getQuotes = async (bookId) => {
  const highlightsQuery = supabase.from("highlights").select("*");
  const response = await highlightsQuery.eq("book_id", bookId);

  if (!response.error) {
    return response.data;
  }

  console.error(response.error);
  return [];
};

/**
 * Look up the book_id of the first "books" row whose title contains `title`
 * (case-insensitive substring match via ILIKE).
 *
 * @param {string} title - Fragment of the book title to search for.
 * @returns {Promise<*>} The book_id of the first matching row, or null when
 *   the query reports an error (which is logged) or no row matches.
 */
const getBookIDFromTitle = async (title) => {
  const booksQuery = supabase.from("books").select("book_id");
  const result = await booksQuery.ilike("title", "%" + title + "%");

  if (result.error) {
    console.error(result.error);
    return null;
  }

  // No matching row: report "not found" as null instead of indexing past the end.
  const rows = result.data;
  return rows.length === 0 ? null : rows[0].book_id;
};

/**
 * Resolve a book by (partial) title and return its highlights.
 *
 * @param {string} bookTitle - Title fragment handed to getBookIDFromTitle.
 * @returns {Promise<Array>} Whatever getQuotes returns for the resolved id, or
 *   an empty array (after logging "Book not found.") when the lookup yields a
 *   falsy id.
 */
const compileQuotesFromTitle = async (bookTitle) => {
  const bookId = await getBookIDFromTitle(bookTitle);

  if (bookId) {
    return await getQuotes(bookId);
  }

  console.error("Book not found.");
  return [];
};

/**
 * Return the highlights stored for a known book id.
 *
 * @param {*} bookID - Value forwarded unchanged to getQuotes.
 * @returns {Promise<Array>} Whatever getQuotes resolves to for that id.
 */
const compileQuotesFomID = async (bookID) => await getQuotes(bookID);

/**
 * Greedily group quotes whose embeddings are similar.
 *
 * Quotes are visited in order. Each quote that has no `cluster` yet seeds a
 * new cluster and pulls in every other still-unassigned quote whose similarity
 * to the seed is strictly greater than `threshold`. Members are compared to
 * the seed only, never to each other.
 *
 * Side effects: sets `quote.cluster` (the cluster index) on every quote it
 * assigns, and logs the mean of all similarity scores it computed.
 *
 * @param {Array<Object>} quotes - Items with an `embedding` that is either a
 *   JSON-encoded array string or an already-decoded array. Quotes that already
 *   carry a `cluster` value are never used as seeds and are never added.
 * @param {number} [threshold=0.01] - Minimum similarity (exclusive) for a
 *   quote to join a seed's cluster.
 * @returns {Array<Array<Object>>} Clusters in creation order; each cluster
 *   starts with its seed quote.
 */
function clusterEmbeddings(quotes, threshold = 0.01) {
  const clusters = [];
  let clusterIndex = 0;
  let similaritySum = 0;
  let comparisons = 0;

  // Decode each embedding at most once, and only when it is actually needed.
  // Re-parsing both vectors on every inner-loop iteration made the number of
  // JSON.parse calls grow quadratically with the number of quotes.
  const decoded = new Map();
  const embeddingAt = (position) => {
    if (!decoded.has(position)) {
      const raw = quotes[position].embedding;
      decoded.set(position, Array.isArray(raw) ? raw : JSON.parse(raw));
    }
    return decoded.get(position);
  };

  quotes.forEach((quote, index) => {
    if (quote.cluster !== undefined) return;

    // This quote becomes the seed of a new cluster.
    quote.cluster = clusterIndex;
    const members = [quote];
    clusters[clusterIndex] = members;
    const seedEmbedding = embeddingAt(index);

    quotes.forEach((otherQuote, otherIndex) => {
      if (otherIndex === index) return;

      const score = similarity(seedEmbedding, embeddingAt(otherIndex));
      if (otherQuote.cluster === undefined && score > threshold) {
        otherQuote.cluster = clusterIndex;
        members.push(otherQuote);
      }

      // Every computed score feeds the average, including those for quotes
      // that already belong to an earlier cluster.
      comparisons++;
      similaritySum += score;
    });

    clusterIndex++;
  });

  console.log("Average similarity: " + similaritySum / comparisons);

  return clusters;
}

/**
 * Ask the chat model for a short topic describing a cluster of quotes.
 *
 * Sends the quotes' `text` fields, one per line, as the user message of a
 * gpt-3.5-turbo chat completion and returns the first choice's content with
 * surrounding whitespace and any leading Markdown heading markers removed.
 *
 * @param {Array<{text: string}>} cluster - Quotes belonging to one cluster.
 * @returns {Promise<string>} The topic text without leading `#` characters.
 * @throws Rethrows, unchanged, whatever error the request or the response
 *   handling raised, after logging it.
 */
const assignTopicToCluster = async (cluster) => {
  try {
    const prompt = `Given the following quotes, what is a good topic for them? Return only the topic as a Markdown heading with no leading #`;

    const completion = await openai.createChatCompletion({
      messages: [
        {
          role: "system",
          content: prompt,
        },
        {
          role: "user",
          content: cluster.map((quote) => quote.text).join("\n"),
        },
      ],
      model: "gpt-3.5-turbo",
    });

    const content = completion.data.choices[0].message.content;

    // Strip only a leading run of heading markers (and the space after it).
    // A '#' inside the topic itself, e.g. "C# Basics", must survive.
    return content.trim().replace(/^#+\s*/, "");
  } catch (err) {
    console.log("START ERROR");
    console.error(err);
    // Errors raised before an HTTP response exists have no `response`
    // property. Dereferencing it unconditionally would throw inside this
    // handler and hide the original error from the caller.
    const apiError = err?.response?.data?.error;
    if (apiError !== undefined) {
      console.error(apiError);
    }
    console.log("END ERROR");
    throw err;
  }
};

/**
 * Script entry point: fetch the highlights for BOOK_ID, cluster them by
 * embedding similarity, ask the model for a topic per cluster, and write the
 * result to quotes.md as Markdown (one "## Cluster N - topic" heading per
 * cluster, one bullet per quote).
 *
 * Topics are requested one cluster at a time, in cluster order.
 *
 * @returns {Promise<void>} Resolves once quotes.md has been written, or
 *   immediately when there are no quotes to write.
 */
const main = async () => {
  const quotes = await compileQuotesFomID(BOOK_ID);

  console.log(quotes.length + " quotes found.");

  // getQuotes reports a failed query as an empty list, so an empty result may
  // mean "query failed". Do not replace an existing quotes.md with an empty
  // file in that case.
  if (quotes.length === 0) {
    console.error("No quotes to compile; quotes.md was not written.");
    return;
  }

  const clusteredQuotes = clusterEmbeddings(quotes, CLUSTER_THRESHOLD);

  let markdown = "";
  for (const [index, cluster] of clusteredQuotes.entries()) {
    const topic = await assignTopicToCluster(cluster);
    console.log("Cluster " + index + ": " + topic);
    markdown += `## Cluster ${index} - ${topic}\n`;
    for (const quote of cluster) {
      markdown += `- ${quote.text}\n`;
    }
  }

  fs.writeFileSync("quotes.md", markdown);
};

// Handle failures explicitly instead of leaving the promise floating, and
// signal them to the calling shell through the exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Loading

0 comments on commit 41ca718

Please sign in to comment.