From 2d498f7acb17afc623a42ad66d041d64af17616b Mon Sep 17 00:00:00 2001 From: Ismail Pelaseyed Date: Mon, 2 Sep 2024 15:12:08 +0200 Subject: [PATCH 1/2] Update google-drive sync template to support more file formats --- .../google-drive/syncs/documents.ts | 99 ++++++++++++------- 1 file changed, 66 insertions(+), 33 deletions(-) diff --git a/integration-templates/google-drive/syncs/documents.ts b/integration-templates/google-drive/syncs/documents.ts index 64d36116c4..a92d36d47d 100644 --- a/integration-templates/google-drive/syncs/documents.ts +++ b/integration-templates/google-drive/syncs/documents.ts @@ -1,4 +1,4 @@ -import type { NangoSync, Document } from '../../models'; +import type { NangoSync, Document } from "../../models"; interface GoogleDriveFileResponse { id: string; @@ -12,17 +12,31 @@ interface Metadata { folders?: string[]; } -const mimeTypeMapping: Record = { - 'application/vnd.google-apps.document': 'text/plain', - 'application/vnd.google-apps.spreadsheet': 'text/csv', - 'application/vnd.google-apps.presentation': 'application/vnd.openxmlformats-officedocument.presentationml.presentation' +const structuredMimeTypeMapping: Record = { + "application/vnd.google-apps.spreadsheet": "text/csv", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": + "text/csv", + "text/csv": "text/csv", }; +const whiteListedMimeTypes: Set = new Set([ + "application/vnd.google-apps.spreadsheet", + "application/vnd.google-apps.document", + "application/vnd.google-apps.presentation", + "application/pdf", + "text/plain", + "text/markdown", + "text/csv", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", +]); + export default async function fetchData(nango: NangoSync): Promise { const metadata = await nango.getMetadata(); if (!metadata || (!metadata.files && !metadata.folders)) { - throw new Error('Metadata for files or folders is required.'); + throw new Error("Metadata for files or folders is required."); } const initialFolders = metadata?.folders ? [...metadata.folders] : []; @@ -38,30 +52,37 @@ export default async function fetchData(nango: NangoSync): Promise { const proxyConfiguration = { endpoint: `drive/v3/files`, params: { - fields: 'files(id, name, mimeType, webViewLink, parents), nextPageToken', + fields: + "files(id, name, mimeType, webViewLink, parents), nextPageToken", pageSize: batchSize.toString(), - q: query + q: query, }, paginate: { - response_path: 'files' - } + response_path: "files", + }, }; - for await (const files of nango.paginate(proxyConfiguration)) { + for await (const files of nango.paginate( + proxyConfiguration + )) { for (const file of files) { - if (file.mimeType === 'application/vnd.google-apps.folder') { + if (file.mimeType === "application/vnd.google-apps.folder") { await processFolder(file.id); - } else if (file.mimeType === 'application/vnd.google-apps.document' || file.mimeType === 'application/pdf') { - const content = await fetchDocumentContent(nango, file, file.mimeType); + } else if (whiteListedMimeTypes.has(file.mimeType)) { + const content = await fetchDocumentContent( + nango, + file, + file.mimeType + ); batch.push({ id: file.id, url: file.webViewLink, - content: content || '', - title: file.name + content: content || "", + title: file.name, }); if (batch.length === batchSize) { - await nango.batchSave(batch, 'Document'); + await nango.batchSave(batch, "Document"); batch = []; } } @@ -79,20 +100,31 @@ export default async function fetchData(nango: NangoSync): Promise { const documentResponse = await nango.get({ endpoint: `drive/v3/files/${file}`, params: { - fields: 'id, name, mimeType, webViewLink, parents' - } + fields: "id, name, mimeType, webViewLink, parents", + }, }); - const content = await fetchDocumentContent(nango, documentResponse.data, documentResponse.data.mimeType); + + if (!whiteListedMimeTypes.has(documentResponse.data.mimeType)) { + await nango.log( + `Skipping file ${file} due to unsupported mime type: ${documentResponse.data.mimeType}` + ); + continue; + } + const content = await fetchDocumentContent( + nango, + documentResponse.data, + documentResponse.data.mimeType + ); batch.push({ id: documentResponse.data.id, url: documentResponse.data.webViewLink, - content: content || '', - title: documentResponse.data.name + content: content || "", + title: documentResponse.data.name, }); if (batch.length === batchSize) { - await nango.batchSave(batch, 'Document'); + await nango.batchSave(batch, "Document"); batch = []; } } catch (e) { @@ -102,30 +134,31 @@ export default async function fetchData(nango: NangoSync): Promise { } if (batch.length > 0) { - await nango.batchSave(batch, 'Document'); + await nango.batchSave(batch, "Document"); } } -async function fetchDocumentContent(nango: NangoSync, doc: GoogleDriveFileResponse, mimeType: string): Promise { +async function fetchDocumentContent( + nango: NangoSync, + doc: GoogleDriveFileResponse, + mimeType: string +): Promise { try { - if (mimeType === 'application/vnd.google-apps.spreadsheet') { + if (structuredMimeTypeMapping[mimeType]) { const contentResponse = await nango.get({ endpoint: `drive/v3/files/${doc.id}/export`, params: { - mimeType: 'text/csv' + mimeType: "text/csv", }, - responseType: 'text' + responseType: "text", }); return contentResponse.data; - } else if (mimeType === 'application/pdf') { - return ''; } else { - const exportType = mimeTypeMapping[mimeType] || 'text/plain'; const contentResponse = await nango.get({ endpoint: `drive/v3/files/${doc.id}/export`, params: { - mimeType: exportType - } + mimeType: "text/plain", + }, }); return contentResponse.data; From 18cf60e74d6a830e735e65d499cfbbfabab4577e Mon Sep 17 00:00:00 2001 From: Ismail Pelaseyed Date: Mon, 9 Sep 2024 08:12:00 +0200 Subject: [PATCH 2/2] Update google drive sync script to support more file formats --- .../google-drive/syncs/documents.ts | 206 +++++++++--------- 1 file changed, 99 insertions(+), 107 deletions(-) diff --git a/integration-templates/google-drive/syncs/documents.ts b/integration-templates/google-drive/syncs/documents.ts index a92d36d47d..beca3d8f36 100644 --- a/integration-templates/google-drive/syncs/documents.ts +++ b/integration-templates/google-drive/syncs/documents.ts @@ -1,4 +1,4 @@ -import type { NangoSync, Document } from "../../models"; +import type { NangoSync, Document } from '../../models'; interface GoogleDriveFileResponse { id: string; @@ -6,165 +6,157 @@ interface GoogleDriveFileResponse { mimeType: string; webViewLink: string; } - interface Metadata { files?: string[]; folders?: string[]; } - -const structuredMimeTypeMapping: Record = { - "application/vnd.google-apps.spreadsheet": "text/csv", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": - "text/csv", - "text/csv": "text/csv", +// Mapping MIME types to their export MIME type and response type +const mimeTypeMapping: Record = { + // Documents + 'application/vnd.google-apps.document': { mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', responseType: 'text' }, + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': { + mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + responseType: 'stream' + }, + 'application/vnd.oasis.opendocument.text': { mimeType: 'application/vnd.oasis.opendocument.text', responseType: 'stream' }, + 'application/rtf': { mimeType: 'application/rtf', responseType: 'stream' }, + 'text/plain': { mimeType: 'text/plain', responseType: 'text' }, + // Spreadsheets + 'application/vnd.google-apps.spreadsheet': { mimeType: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', responseType: 'text' }, + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': { + mimeType: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + responseType: 'stream' + }, + 'application/vnd.oasis.opendocument.spreadsheet': { mimeType: 'application/vnd.oasis.opendocument.spreadsheet', responseType: 'stream' }, + // PDFs + 'application/pdf': { mimeType: 'application/pdf', responseType: 'stream' }, + // Text Files + 'text/csv': { mimeType: 'text/csv', responseType: 'text' }, + 'text/tab-separated-values': { mimeType: 'text/tab-separated-values', responseType: 'text' }, + // Presentations + 'application/vnd.google-apps.presentation': { mimeType: 'application/vnd.openxmlformats-officedocument.presentationml.presentation', responseType: 'text' }, + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': { + mimeType: 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + responseType: 'stream' + }, + 'application/vnd.oasis.opendocument.presentation': { mimeType: 'application/vnd.oasis.opendocument.presentation', responseType: 'stream' }, + // Drawings and Images + 'application/vnd.google-apps.drawing': { mimeType: 'image/jpeg', responseType: 'stream' }, + 'image/jpeg': { mimeType: 'image/jpeg', responseType: 'stream' }, + 'image/png': { mimeType: 'image/png', responseType: 'stream' }, + 'image/svg+xml': { mimeType: 'image/svg+xml', responseType: 'stream' } }; - -const whiteListedMimeTypes: Set = new Set([ - "application/vnd.google-apps.spreadsheet", - "application/vnd.google-apps.document", - "application/vnd.google-apps.presentation", - "application/pdf", - "text/plain", - "text/markdown", - "text/csv", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", -]); - export default async function fetchData(nango: NangoSync): Promise { const metadata = await nango.getMetadata(); - if (!metadata || (!metadata.files && !metadata.folders)) { - throw new Error("Metadata for files or folders is required."); + throw new Error('Metadata for files or folders is required.'); } - const initialFolders = metadata?.folders ? [...metadata.folders] : []; const processedFolders = new Set(); const batchSize = 100; let batch: Document[] = []; - + // Recursive function to process files in a folder async function processFolder(folderId: string) { if (processedFolders.has(folderId)) return; processedFolders.add(folderId); - const query = `('${folderId}' in parents) and trashed = false`; const proxyConfiguration = { endpoint: `drive/v3/files`, params: { - fields: - "files(id, name, mimeType, webViewLink, parents), nextPageToken", + fields: 'files(id, name, mimeType, webViewLink, parents), nextPageToken', pageSize: batchSize.toString(), - q: query, + q: query }, paginate: { - response_path: "files", - }, + response_path: 'files' + } }; - - for await (const files of nango.paginate( - proxyConfiguration - )) { + for await (const files of nango.paginate(proxyConfiguration)) { for (const file of files) { - if (file.mimeType === "application/vnd.google-apps.folder") { + if (file.mimeType === 'application/vnd.google-apps.folder') { await processFolder(file.id); - } else if (whiteListedMimeTypes.has(file.mimeType)) { - const content = await fetchDocumentContent( - nango, - file, - file.mimeType - ); - batch.push({ - id: file.id, - url: file.webViewLink, - content: content || "", - title: file.name, - }); - - if (batch.length === batchSize) { - await nango.batchSave(batch, "Document"); - batch = []; - } + } else { + await processFile(file); } } } } - + // Function to process individual files + async function processFile(file: GoogleDriveFileResponse) { + const content = await fetchDocumentContent(nango, file); + batch.push({ + id: file.id, + url: file.webViewLink, + content: content ?? '', + title: file.name + }); + if (batch.length === batchSize) { + await nango.batchSave(batch, 'Document'); + batch = []; + } + } + // Process initial folders for (const folderId of initialFolders) { await processFolder(folderId); } - + // Process individual files if (metadata?.files) { for (const file of metadata.files) { try { const documentResponse = await nango.get({ endpoint: `drive/v3/files/${file}`, params: { - fields: "id, name, mimeType, webViewLink, parents", - }, - }); - - if (!whiteListedMimeTypes.has(documentResponse.data.mimeType)) { - await nango.log( - `Skipping file ${file} due to unsupported mime type: ${documentResponse.data.mimeType}` - ); - continue; - } - const content = await fetchDocumentContent( - nango, - documentResponse.data, - documentResponse.data.mimeType - ); - - batch.push({ - id: documentResponse.data.id, - url: documentResponse.data.webViewLink, - content: content || "", - title: documentResponse.data.name, + fields: 'id, name, mimeType, webViewLink, parents' + } }); - - if (batch.length === batchSize) { - await nango.batchSave(batch, "Document"); - batch = []; - } + await processFile(documentResponse.data); } catch (e) { await nango.log(`Error fetching file ${file}: ${e}`); } } } - + // Save remaining batch if (batch.length > 0) { - await nango.batchSave(batch, "Document"); + await nango.batchSave(batch, 'Document'); } } -async function fetchDocumentContent( - nango: NangoSync, - doc: GoogleDriveFileResponse, - mimeType: string -): Promise { +async function fetchDocumentContent(nango: NangoSync, doc: GoogleDriveFileResponse): Promise { try { - if (structuredMimeTypeMapping[mimeType]) { - const contentResponse = await nango.get({ - endpoint: `drive/v3/files/${doc.id}/export`, - params: { - mimeType: "text/csv", - }, - responseType: "text", - }); - return contentResponse.data; - } else { - const contentResponse = await nango.get({ - endpoint: `drive/v3/files/${doc.id}/export`, - params: { - mimeType: "text/plain", - }, - }); - - return contentResponse.data; + const mapping = mimeTypeMapping[doc.mimeType]; + if (!mapping) { + await nango.log(`Unsupported MIME type for content extraction: ${doc.mimeType}`); + return null; } + const { responseType, mimeType: exportMimeType } = mapping; + return await fetchFileContent(nango, doc.id, exportMimeType, responseType); } catch (e) { await nango.log(`Error fetching content for ${doc.name}: ${e}`); return null; } } + +async function fetchFileContent(nango: NangoSync, fileId: string, mimeType: string, responseType: 'text' | 'stream'): Promise { + try { + const endpoint = responseType === 'text' ? `drive/v3/files/${fileId}/export` : `drive/v3/files/${fileId}`; + const params = responseType === 'text' ? { mimeType } : { alt: 'media' }; + const response = await nango.get({ + endpoint, + params, + responseType + }); + if (responseType === 'text') { + return response.data ?? null; + } else { + const chunks: Buffer[] = []; + for await (const chunk of response.data) { + chunks.push(chunk); + } + const buffer = Buffer.concat(chunks); + return buffer.toString('base64'); + } + } catch (e) { + await nango.log(`Error fetching content for file ${fileId}: ${e}`); + return null; + } +}