Skip to content

Commit

Permalink
Input data fixes (#200)
Browse files Browse the repository at this point in the history
var fixes with input data stream etc
  • Loading branch information
niklub authored Jan 29, 2020
1 parent eb0ac6d commit 9a3796c
Show file tree
Hide file tree
Showing 10 changed files with 485 additions and 489 deletions.
513 changes: 201 additions & 312 deletions label_studio/project.py

Large diffs are not rendered by default.

170 changes: 18 additions & 152 deletions label_studio/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
from label_studio.utils.validation import TaskValidator
from label_studio.utils.exceptions import ValidationError
from label_studio.utils.functions import generate_sample_task_without_check, data_examples
from label_studio.utils.misc import (
exception_treatment, log_config, log, config_line_stripped,
get_config_templates, iter_config_templates
)
from label_studio.utils.misc import exception_treatment, log_config, log, config_line_stripped, get_config_templates
from label_studio.utils.argparser import parse_input_args

from label_studio.project import Project
from label_studio.tasks import Tasks

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -208,6 +208,7 @@ def import_page():
project = project_get_or_create()

project.analytics.send(getframeinfo(currentframe()).function)
project.project_obj.name = project.name
return flask.render_template(
'import.html',
config=project.config,
Expand Down Expand Up @@ -302,7 +303,6 @@ def api_save_config():
return make_response(jsonify({'label_config': [str(e)]}), status.HTTP_400_BAD_REQUEST)

project.update_label_config(label_config)
project.reload()
project.analytics.send(getframeinfo(currentframe()).function)
return Response(status=status.HTTP_201_CREATED)

Expand Down Expand Up @@ -401,30 +401,19 @@ class DjangoRequest:
except ValidationError as e:
return make_response(jsonify(e.msg_to_list()), status.HTTP_400_BAD_REQUEST)

# save task file to input dir
if os.path.isdir(project.config['input_path']):
# tasks are in directory, write a new file with tasks
task_dir = project.config['input_path']
now = datetime.now()
data = json.dumps(new_tasks, ensure_ascii=False)
md5 = hashlib.md5(json.dumps(data).encode('utf-8')).hexdigest()
name = 'import-' + now.strftime('%Y-%m-%d-%H-%M') + '-' + str(md5[0:8])
path = os.path.join(task_dir, name + '.json')
tasks = new_tasks
else:
# tasks are all in one file, append it
path = project.config['input_path']
old_tasks = json.load(open(path))
assert isinstance(old_tasks, list), 'Tasks from input_path must be list'
tasks = old_tasks + new_tasks
logger.error("It's recommended to use directory as input_path: " +
project.config['input_path'] + ' -> ' + os.path.dirname(project.config['input_path']))
# tasks are all in one file, append it
path = project.config['input_path']
old_tasks = json.load(open(path))
max_id_in_old_tasks = max(old_tasks.keys()) if old_tasks else -1
new_tasks = Tasks().from_list_of_dicts(new_tasks, max_id_in_old_tasks + 1)
old_tasks.update(new_tasks)

with open(path, 'w') as f:
json.dump(tasks, f, ensure_ascii=False, indent=4)
json.dump(old_tasks, f, ensure_ascii=False, indent=4)

# load new tasks
project.reload()
# load new tasks and everything related
project.load_tasks()
project.load_derived_schemas()

duration = time.time() - start
return make_response(jsonify({
Expand Down Expand Up @@ -627,138 +616,17 @@ def get_data_file(filename):
return flask.send_from_directory(directory, filename, as_attachment=True)


def parse_input_args():
    """ Build the Label Studio command line interface and parse sys.argv

    Three sub-commands are registered: `init`, `start` and
    `start-multi-session`. When `--template` is given and no label config was
    passed explicitly, `label_config` is pointed at the template's config.xml.

    :return: argparse namespace with parsed arguments
             (`label_config` attribute is always present)
    """
    import sys
    import argparse

    if len(sys.argv) == 1:
        print('\nQuick start usage: label-studio start my_project --init\n')

    # options inherited by every sub-command
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument(
        '-b', '--no-browser', dest='no_browser', action='store_true',
        help='Do not open browser at label studio start')
    common.add_argument(
        '-d', '--debug', dest='debug', action='store_true',
        help='Debug mode for Flask', default=None)
    common.add_argument(
        '--root-dir', dest='root_dir', default='.',
        help='Projects root directory')
    common.add_argument(
        '-v', '--verbose', dest='verbose', action='store_true',
        help='Increase output verbosity')

    top_level = argparse.ArgumentParser(description='Label studio')
    commands = top_level.add_subparsers(dest='command', help='Available commands')
    commands.required = True

    # template name is the directory that holds the template config
    template_names = [
        os.path.basename(os.path.dirname(config_file))
        for config_file in iter_config_templates()
    ]

    # options shared by `start` and `start-multi-session`, in help order
    server_options = (
        (('-c', '--config'), dict(dest='config_path', help='Server config')),
        (('-l', '--label-config'), dict(dest='label_config', default='', help='Label config path')),
        (('-i', '--input-path'), dict(
            dest='input_path', default='',
            help='Input path to task file or directory with tasks')),
        (('-o', '--output-dir'), dict(
            dest='output_dir', default='', help='Output directory for completions')),
        (('-p', '--port'), dict(dest='port', default=8200, type=int, help='Server port')),
        (('--ml-backend-url',), dict(dest='ml_backend_url', help='Machine learning backend URL')),
        (('--ml-backend-name',), dict(dest='ml_backend_name', help='Machine learning backend name')),
    )

    def add_template_option(target):
        target.add_argument(
            '--template', dest='template', choices=template_names,
            help='Choose from predefined project templates')

    def add_server_options(target):
        for flags, options in server_options:
            target.add_argument(*flags, **options)

    # init sub-command
    init_cmd = commands.add_parser('init', help='Initialize Label Studio', parents=[common])
    init_cmd.add_argument(
        'project_name',
        help='Path to directory where project state will be initialized')
    add_template_option(init_cmd)

    # start sub-command
    start_cmd = commands.add_parser('start', help='Start Label Studio server', parents=[common])
    start_cmd.add_argument(
        'project_name',
        help='Path to directory where project state has been initialized')
    start_cmd.add_argument(
        '--init', dest='init', action='store_true',
        help='Initialize if project is not initialized yet')
    add_template_option(start_cmd)
    add_server_options(start_cmd)

    # start-multi-session sub-command
    multi_session_cmd = commands.add_parser(
        'start-multi-session', help='Start Label Studio server', parents=[common])
    add_template_option(multi_session_cmd)
    add_server_options(multi_session_cmd)

    parsed = top_level.parse_args()

    # `init` has no --label-config option, so the attribute may be missing
    explicit_label_config = getattr(parsed, 'label_config', None)
    if parsed.template and not explicit_label_config:
        parsed.label_config = os.path.join(find_dir('examples'), parsed.template, 'config.xml')
    if not hasattr(parsed, 'label_config'):
        parsed.label_config = None
    return parsed


def main():
import threading
import webbrowser

import label_studio.utils.functions

global input_args

input_args = parse_input_args()

import label_studio.utils.functions
label_studio.utils.functions.HOSTNAME = 'http://localhost:' + str(input_args.port)

# On `init` command, create directory args.project_name with initial project state and exit
if input_args.command == 'init':
Project.create_project_dir(input_args.project_name, input_args)
Expand All @@ -770,8 +638,6 @@ def main():
if input_args.init:
Project.create_project_dir(input_args.project_name, input_args)

label_studio.utils.functions.HOSTNAME = 'http://localhost:' + str(input_args.port)

# On `start` command, launch browser if --no-browser is not specified and start label studio server
if input_args.command == 'start':
if not input_args.no_browser:
Expand Down
103 changes: 103 additions & 0 deletions label_studio/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import orjson
import os
import io
import urllib

from label_studio.utils.io import iter_files


class Tasks(object):
    """ Builders that convert different input sources into a tasks dict

    Every `from_*` method returns `{task_id: task}` where `task` is a dict
    holding at least the `id` and `data` keys.
    """

    # file extensions picked up per data type when a directory is scanned
    _allowed_extensions = {
        'Text': ('.txt',),
        'Image': ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'),
        'Audio': ('.wav', '.aiff', '.mp3', '.au', '.flac')
    }

    def _create_task_with_local_uri(self, filepath, data_key, task_id):
        """ Convert filepath to task with flask serving URL

        :param filepath: path to a local resource file
        :param data_key: key under which the generated URL is stored in task data
        :param task_id: id assigned to the created task
        :return: task dict with `id`, `task_path` and `data` keys
        """
        # a bare `import urllib` does not guarantee that the `parse` submodule is loaded
        import urllib.parse
        from label_studio.utils.functions import HOSTNAME

        filename = os.path.basename(filepath)
        # the file directory travels in the query string, the file name in the URL path
        params = urllib.parse.urlencode({'d': os.path.dirname(filepath)})
        base_url = HOSTNAME + '/'
        image_url_path = base_url + urllib.parse.quote('data/' + filename)
        image_local_url = '{image_url_path}?{params}'.format(image_url_path=image_url_path, params=params)
        return {
            'id': task_id,
            'task_path': filepath,
            'data': {data_key: image_local_url}
        }

    def from_dict(self, d, task_id=0):
        """ Create a single task from a dict

        The task data is `d['data']` when that key exists, otherwise `d` itself.
        `predictions` found inside the data are moved to the task level;
        top-level `predictions` of `d` take precedence.

        :param d: source dict (left unmodified)
        :param task_id: id assigned to the created task
        :return: `{task_id: task}`
        """
        data = d['data'] if 'data' in d else d
        if isinstance(data, dict):
            # shallow copy: popping `predictions` below must not alter the caller's dict
            data = dict(data)
        task = {'id': task_id, 'data': data}
        if 'predictions' in data:
            task['predictions'] = data['predictions']
            data.pop('predictions', None)
        if 'predictions' in d:
            task['predictions'] = d['predictions']
        return {task_id: task}

    def from_list_of_dicts(self, l, start_task_id=0):
        """ Create tasks from a list of dicts, ids are consecutive from `start_task_id`

        :param l: list of source dicts, see `from_dict`
        :param start_task_id: id of the first task
        :return: `{task_id: task}`
        """
        tasks = {}
        for task_id, item in enumerate(l, start_task_id):
            tasks.update(self.from_dict(item, task_id))
        return tasks

    def from_json_file(self, path, start_task_id=0):
        """ Create tasks from a JSON file holding one task (dict) or many tasks (list)

        :param path: path to the JSON file
        :param start_task_id: id of the first task
        :return: `{task_id: task}`
        :raises Exception: when the JSON root is neither a list nor a dict
        """
        # JSON is decoded as UTF-8 explicitly instead of the platform default encoding
        with open(path, encoding='utf-8') as f:
            json_body = orjson.loads(f.read())

        # multiple tasks in file
        if isinstance(json_body, list):
            return self.from_list_of_dicts(json_body, start_task_id)

        # one task in file
        if isinstance(json_body, dict):
            return self.from_dict(json_body, start_task_id)

        # unsupported task type
        raise Exception('Unsupported task data:', path)

    def from_dir_with_json_files(self, path):
        """ Create tasks from all `.json` files yielded by `iter_files(path, ext='.json')`

        :param path: directory with JSON files
        :return: `{task_id: task}`
        """
        tasks = {}
        for f in iter_files(path, ext='.json'):
            # ids continue after the tasks collected so far
            tasks.update(self.from_json_file(f, start_task_id=len(tasks)))
        return tasks

    def from_text_file(self, path, data_key, start_task_id=0):
        """ Create one task per line of a text file

        :param path: path to the text file
        :param data_key: key under which the stripped line is stored in task data
        :param start_task_id: id of the first task
        :return: `{task_id: task}`
        """
        tasks = {}
        with io.open(path) as f:
            # every line gets its own id (previously the id never advanced,
            # so each line overwrote the previous one)
            for task_id, line in enumerate(f, start_task_id):
                tasks[task_id] = {'id': task_id, 'data': {data_key: line.strip()}}
        return tasks

    def from_dir_with_text_files(self, path, data_key):
        """ Create tasks from the lines of all files yielded by `iter_files(path, ext='')`

        :param path: directory with text files
        :param data_key: key under which each line is stored in task data
        :return: `{task_id: task}`
        """
        tasks = {}
        for f in iter_files(path, ext=''):
            # ids continue after the tasks collected so far
            tasks.update(self.from_text_file(f, data_key, start_task_id=len(tasks)))
        return tasks

    def _from_dir_with_local_resources(self, path, data_key, data_type):
        """ Create one task per resource file of the given data type

        :param path: directory with resource files
        :param data_key: key under which the serving URL is stored in task data
        :param data_type: key of `_allowed_extensions` ('Text', 'Image' or 'Audio')
        :return: `{task_id: task}`
        """
        tasks = {}
        for f in iter_files(path, ext=self._allowed_extensions[data_type]):
            # NOTE(review): ids start from 1 here while the other loaders start
            # from 0 - kept as is, confirm whether this is intended
            task_id = len(tasks) + 1
            tasks[task_id] = self._create_task_with_local_uri(f, data_key, task_id)
        return tasks

    def from_dir_with_image_files(self, path, data_key):
        """ Create one task per image file found in `path` """
        return self._from_dir_with_local_resources(path, data_key, 'Image')

    def from_dir_with_audio_files(self, path, data_key):
        """ Create one task per audio file found in `path` """
        return self._from_dir_with_local_resources(path, data_key, 'Audio')
2 changes: 1 addition & 1 deletion label_studio/templates/import_help.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<div class="header active title"><i class="dropdown icon"></i>Import formats and examples</div>
<div class="content active">
You can use open source datasets and build tasks in
<a class="no-go" target="_blank" href='https://data.heartex.net/?data_types={{ project.data_types_json }}'>
<a class="no-go" target="_blank" href='https://data.heartex.net/?data_types={{ project.data_types_json }}&ref=label-studio&p={{ project.name }}'>
Heartex.Datasets
</a>.
<br/><br/>
Expand Down
11 changes: 9 additions & 2 deletions label_studio/utils/analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@
import os
import io
import requests
import calendar

from datetime import datetime
from mixpanel import Mixpanel, MixpanelException
from copy import deepcopy
from operator import itemgetter

from uuid import uuid4
from .misc import get_app_version, parse_config, convert_string_to_hash
from .io import get_config_dir
Expand Down Expand Up @@ -63,6 +66,9 @@ def _get_label_types(self):
})
return label_types

def _get_timestamp_now(self):
return calendar.timegm(datetime.now().timetuple())

def update_info(self, label_config_line, collect_analytics=True, project_name='', context=None):
if label_config_line != self._label_config_line:
self._label_types = self._get_label_types()
Expand All @@ -86,9 +92,10 @@ def send(self, event_name, **kwargs):

json_data = data
json_data['event'] = event_name
json_data['user_id'] = self._user_id
json_data['server_id'] = self._user_id
json_data['server_time'] = self._get_timestamp_now()
try:
url = 'https://analytics.labelstudio.io/prod'
url = 'https://analytics.labelstud.io/prod'
logger.debug('Sending to {url}:\n{data}'.format(url=url, data=json_data))
requests.post(url=url, json=json_data)
except requests.RequestException as exc:
Expand Down
Loading

0 comments on commit 9a3796c

Please sign in to comment.