@tornado.web.authenticated
async def get(self, drive: str = "", path: str = ""):
    """Return the contents of a file or directory of a drive.

    Args:
        drive: name of the drive
        path: path of the file or directory inside the drive
    """
    # NOTE(review): the (drive, path) order presumably mirrors the capture
    # groups of the route pattern — confirm against the URL spec.
    result = await self._manager.get_contents(drive, path)
    self.finish(result)


@tornado.web.authenticated
async def post(self, drive: str = "", path: str = ""):
    """Create a new file or directory at ``path`` inside ``drive``.

    Args:
        drive: name of the drive
        path: path of the object to create inside the drive
    """
    result = await self._manager.new_file(drive, path)
    self.finish(result)


@tornado.web.authenticated
async def patch(self, drive: str = "", path: str = ""):
    """Rename the object at ``path`` inside ``drive``.

    The JSON request body is forwarded to the manager as keyword arguments.

    Args:
        drive: name of the drive
        path: current path of the object inside the drive
    """
    # get_json_body() yields None for a request without a body; unpacking
    # None with ** would raise TypeError, so fall back to no extra arguments.
    body = self.get_json_body() or {}
    result = await self._manager.rename_file(drive, path, **body)
    self.finish(result)
async def get_contents(self, drive_name, path):
    """Get contents of a file or directory.

    The objects stored under ``path`` are listed first. When nothing is
    listed and ``path`` is not the drive root, ``path`` is fetched as a
    single object instead.

    Args:
        drive_name: name of drive to get the contents of
        path: path to file or directory (empty string or "/" for root listing)

    Returns:
        A dict ``{"data": ...}``. For a directory ``data`` is a list of
        ``{"path", "last_modified", "size"}`` entries (an empty list for an
        empty directory or drive); for a file it is a single
        ``{"path", "content", "last_modified", "size"}`` dict.

    Raises:
        tornado.web.HTTPError: with status 400 when the drive is not mounted
            or when listing or reading the object fails.
    """
    if path == "/":
        path = ""

    # Report an unmounted drive explicitly instead of letting the KeyError
    # text end up as the whole error message.
    if drive_name not in self._content_managers:
        raise tornado.web.HTTPError(
            status_code=httpx.codes.BAD_REQUEST,
            reason=f"The drive {drive_name} is not mounted.",
        )
    store = self._content_managers[drive_name]

    try:
        data = await self._list_objects(store, path)
        # nothing listed under a non-root path: it is either a single object
        # or an empty directory
        if not data and path != "":
            data = await self._read_object(store, path)
    except Exception as e:
        raise tornado.web.HTTPError(
            status_code=httpx.codes.BAD_REQUEST,
            reason=f"The following error occured when retrieving the contents: {e}",
        ) from e

    return {"data": data}


async def _list_objects(self, store, prefix):
    """Return one listing entry per object found under ``prefix``."""
    entries = []
    # using Arrow lists as they are recommended for large results;
    # the stream is an async iterable of RecordBatch
    stream = obs.list(store, prefix, chunk_size=100, return_arrow=True)
    async for batch in stream:
        for item in pyarrow.record_batch(batch).to_pylist():
            entries.append(
                {
                    "path": item["path"],
                    "last_modified": item["last_modified"].isoformat(),
                    "size": item["size"],
                }
            )
    return entries


async def _read_object(self, store, path):
    """Return the file entry for the object at ``path``.

    An empty list is returned when the object has no content and its name
    does not end in a known file extension, i.e. it is treated as an empty
    directory rather than as an empty file.
    """
    # extensions whose content is sent base64-encoded (images and PDF) so that
    # it stays viewable in JupyterLab
    base64_extensions = (
        ".pdf", ".svg", ".tif", ".tiff", ".jpg", ".jpeg", ".gif", ".png",
        ".bmp", ".webp",
    )
    # extensions used to tell an empty file from an empty directory
    # TO DO: find better way to check
    known_extensions = (
        ".R", ".bmp", ".csv", ".gif", ".html", ".ipynb", ".jl", ".jpeg",
        ".jpg", ".json", ".jsonl", ".md", ".ndjson", ".pdf", ".png", ".py",
        ".svg", ".tif", ".tiff", ".tsv", ".txt", ".webp", ".yaml", ".yml",
    )

    # collect the chunks and join once: repeated bytes concatenation copies
    # the whole buffer on every chunk
    chunks = []
    obj = await obs.get_async(store, path)
    async for buf in obj.stream(min_chunk_size=5 * 1024 * 1024):  # 5MB sized chunks
        chunks.append(buf)
    content = b"".join(chunks)

    ext = os.path.splitext(path)[1]
    if not content and ext not in known_extensions:
        return []

    metadata = await obs.head_async(store, path)

    if ext in base64_extensions:
        processed_content = base64.b64encode(content).decode("utf-8")
    else:
        processed_content = content.decode("utf-8")

    return {
        "path": path,
        "content": processed_content,
        "last_modified": metadata["last_modified"].isoformat(),
        "size": metadata["size"],
    }
- +import { JupyterFrontEnd } from '@jupyterlab/application'; import { Signal, ISignal } from '@lumino/signaling'; import { Contents, ServerConnection } from '@jupyterlab/services'; -import { PathExt } from '@jupyterlab/coreutils'; -import { IDriveInfo } from './token'; -import { mountDrive } from './requests'; +import { IDriveInfo, IRegisteredFileTypes } from './token'; +import { getContents, mountDrive } from './requests'; let data: Contents.IModel = { name: '', @@ -120,6 +117,20 @@ export class Drive implements Contents.IDrive { return this._serverSettings; } + /** + * The registered file types + */ + get registeredFileTypes(): IRegisteredFileTypes { + return this._registeredFileTypes; + } + + /** + * The registered file types + */ + set registeredFileTypes(fileTypes: IRegisteredFileTypes) { + this._registeredFileTypes = fileTypes; + } + /** * A signal emitted when a file operation takes place. */ @@ -185,40 +196,41 @@ export class Drive implements Contents.IDrive { ): Promise { let relativePath = ''; if (localPath !== '') { - if (localPath.includes(this.name)) { - relativePath = localPath.split(this.name + '/')[1]; - } else { - relativePath = localPath; - } - // extract current drive name - const currentDrive = this.drivesList.filter(x => x.name === localPath)[0]; + const currentDrive = this._drivesList.filter( + x => + x.name === + (localPath.indexOf('/') !== -1 + ? 
localPath.substring(0, localPath.indexOf('/')) + : localPath) + )[0]; + // when accessed the first time, mount drive - if (!currentDrive.mounted) { + if (currentDrive.mounted === false) { try { await mountDrive(localPath, { provider: currentDrive.provider, region: currentDrive.region }); - currentDrive.mounted = true; + this._drivesList.filter(x => x.name === localPath)[0].mounted = true; } catch (e) { console.log(e); } } - data = { - name: PathExt.basename(localPath), - path: PathExt.basename(localPath), - last_modified: '', - created: '', - content: [], - format: 'json', - mimetype: '', - size: undefined, - writable: true, - type: 'directory' - }; + // eliminate drive name from path + relativePath = + localPath.indexOf('/') !== -1 + ? localPath.substring(localPath.indexOf('/') + 1) + : ''; + + data = await getContents(currentDrive.name, { + path: relativePath, + registeredFileTypes: this._registeredFileTypes + }); } else { + // retriving list of contents from root + // in our case: list available drives const drivesList: Contents.IModel[] = []; for (const drive of this._drivesList) { drivesList.push({ @@ -248,7 +260,6 @@ export class Drive implements Contents.IDrive { type: 'directory' }; } - console.log('GET: ', relativePath); Contents.validateContentsModel(data); return data; @@ -558,7 +569,11 @@ export class Drive implements Contents.IDrive { * checkpoint is created. */ createCheckpoint(path: string): Promise { - return Promise.reject('Repository is read only'); + const emptyCheckpoint: Contents.ICheckpointModel = { + id: '', + last_modified: '' + }; + return Promise.resolve(emptyCheckpoint); } /** @@ -599,6 +614,40 @@ export class Drive implements Contents.IDrive { return Promise.reject('Read only'); } + /** + * Get all registered file types and store them accordingly with their file + * extension (e.g.: .txt, .pdf, .jpeg), file mimetype (e.g.: text/plain, application/pdf) + * and file format (e.g.: base64, text). 
/**
 * Record, for every file extension known to the document registry, the
 * file type name, the mimetypes and the file format that go with it
 * (e.g. `.txt` with `text/plain` and `text`).
 *
 * File types that declare no extension register the `''` key as a directory.
 *
 * @param app The application whose document registry is read.
 */
getRegisteredFileTypes(app: JupyterFrontEnd) {
  const known = this._registeredFileTypes;

  for (const fileType of app.docRegistry.fileTypes()) {
    const { extensions } = fileType;

    // a type without extensions is handled as a directory
    if (extensions.length === 0) {
      known[''] = {
        fileType: 'directory',
        fileFormat: 'json',
        fileMimeTypes: ['text/directory']
      };
    }

    // the first file type that claims an extension keeps it
    for (const extension of extensions) {
      if (known[extension]) {
        continue;
      }
      known[extension] = {
        fileType: fileType.name,
        fileMimeTypes: [...fileType.mimeTypes],
        fileFormat: fileType.fileFormat ?? ''
      };
    }
  }
}
const driveBrowser = fileBrowserFactory.createFileBrowser('drivebrowser', { auto: false, diff --git a/src/requests.ts b/src/requests.ts index d6ac6e3..2edb2c7 100644 --- a/src/requests.ts +++ b/src/requests.ts @@ -1,9 +1,29 @@ import { ReadonlyJSONObject } from '@lumino/coreutils'; +import { Contents } from '@jupyterlab/services'; +import { PathExt } from '@jupyterlab/coreutils'; + import { requestAPI } from './handler'; +import { getFileType, IRegisteredFileTypes, IContentsList } from './token'; + +/** + * The data contents model. + */ +let data: Contents.IModel = { + name: '', + path: '', + last_modified: '', + created: '', + content: null, + format: null, + mimetype: '', + size: 0, + writable: true, + type: '' +}; /** * Fetch the list of available drives. - * @returns list of drives + * @returns The list of available drives. */ export async function getDrivesList() { return await requestAPI('drives', 'GET'); @@ -12,6 +32,8 @@ export async function getDrivesList() { /** * Mount a drive by establishing a connection with it. * @param driveName + * @param options.provider The provider of the drive to be mounted. + * @param options.region The region of the drive to be mounted. */ export async function mountDrive( driveName: string, @@ -24,3 +46,94 @@ export async function mountDrive( }; return await requestAPI('drives', 'POST', body); } + +/** + * Get contents of a directory or retrieve contents of a specific file. + * + * @param driveName + * @param options.path The path of object to be retrived + * @param options.path The list containing all registered file types. + * + * @returns A promise which resolves with the contents model. 
/**
 * Resolve type, mimetype and format for a name that is known to be a file.
 *
 * Extension-less names would otherwise pick up the `''` registry entry,
 * which describes directories, so they are resolved against an empty
 * registry and get the plain-text defaults.
 */
function fileTypeOfFile(
  fileName: string,
  registeredFileTypes: IRegisteredFileTypes
) {
  const extension = PathExt.extname(fileName);
  return getFileType(extension, extension === '' ? {} : registeredFileTypes);
}

/**
 * Get contents of a directory or retrieve contents of a specific file.
 *
 * @param driveName The name of the drive the object belongs to.
 * @param options.path The path of the object to be retrieved, relative to the drive.
 * @param options.registeredFileTypes The registered file types, keyed by file extension.
 *
 * @returns A promise which resolves with the contents model. When the server
 * response carries no data, an empty model is returned.
 */
export async function getContents(
  driveName: string,
  options: { path: string; registeredFileTypes: IRegisteredFileTypes }
): Promise<Contents.IModel> {
  const response = await requestAPI<any>(
    'drives/' + driveName + '/' + options.path,
    'GET'
  );

  // Work on a private copy of the module-level template so that a response
  // without payload never hands back the model built by an earlier call.
  if (!response || !response.data) {
    return { ...data };
  }

  // the server sends a list for a directory and a single object for a file
  if (Array.isArray(response.data)) {
    const prefix = options.path ? options.path + '/' : '';
    const fileList: IContentsList = {};

    for (const row of response.data) {
      // skip the entry that stands for the listed directory itself
      if (row.path === options.path || row.path === options.path + '/') {
        continue;
      }

      // path of the object relative to the listed directory
      const relativePath: string = row.path.startsWith(prefix)
        ? row.path.slice(prefix.length)
        : row.path;
      const [fileName, ...deeperSegments] = relativePath.split('/');

      // several objects can live below the same sub-directory: keep the first
      if (fileList[fileName]) {
        continue;
      }

      // an object located deeper than the first segment means that the
      // first segment is a sub-directory, whatever its name looks like
      const isSubDirectory = deeperSegments.length > 0;
      const [fileType, fileMimeType, fileFormat] = isSubDirectory
        ? ['directory', 'text/directory', 'json']
        : fileTypeOfFile(fileName, options.registeredFileTypes);

      fileList[fileName] = {
        name: fileName,
        path: driveName + '/' + row.path,
        last_modified: row.last_modified,
        created: '',
        content: isSubDirectory ? [] : null,
        format: fileFormat as Contents.FileFormat,
        mimetype: fileMimeType,
        size: row.size,
        writable: true,
        type: fileType
      };
    }

    return {
      name: options.path ? PathExt.basename(options.path) : '',
      path: options.path ? options.path + '/' : '',
      last_modified: '',
      created: '',
      content: Object.values(fileList),
      format: 'json',
      mimetype: '',
      size: undefined,
      writable: true,
      type: 'directory'
    };
  }

  // getting the contents of a file
  const [fileType, fileMimeType, fileFormat] = fileTypeOfFile(
    PathExt.basename(options.path),
    options.registeredFileTypes
  );

  return {
    name: PathExt.basename(options.path),
    path: driveName + '/' + response.data.path,
    last_modified: response.data.last_modified,
    created: '',
    content: response.data.content,
    format: fileFormat as Contents.FileFormat,
    mimetype: fileMimeType,
    size: response.data.size,
    writable: true,
    type: fileType
  };
}
/**
 * Helper that resolves the file type, mimetype and format of a file from
 * its extension.
 *
 * @param extension file extension including the leading dot (e.g.: .txt, .ipynb, .csv)
 * @param registeredFileTypes the registered file types, keyed by file extension
 * @returns the `[fileType, fileMimetype, fileFormat]` tuple; an extension
 * that is not registered resolves to `['text', 'text/plain', 'text']`
 */
export function getFileType(
  extension: string,
  registeredFileTypes: IRegisteredFileTypes
): [string, string, string] {
  let fileType: string = 'text';
  let fileMimetype: string = 'text/plain';
  let fileFormat: string = 'text';

  const registered = registeredFileTypes[extension];
  if (registered) {
    fileType = registered.fileType;
    // a registered type may come without any mimetype or with an empty
    // format: keep the defaults instead of returning undefined or ''
    fileMimetype = registered.fileMimeTypes[0] ?? fileMimetype;
    fileFormat = registered.fileFormat || fileFormat;
  }

  // the file format for notebooks appears as json, but should be text
  if (extension === '.ipynb') {
    fileFormat = 'text';
  }

  return [fileType, fileMimetype, fileFormat];
}