allow adding files using url #75

Open · wants to merge 1 commit into master
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -1,3 +1,4 @@
requests
alembic==1.2.1
Flask==1.1.1
Flask-Login==0.4.1
102 changes: 98 additions & 4 deletions backend/routes/data.py
@@ -1,6 +1,7 @@
import json
import sqlalchemy as sa
import uuid
import requests

from pathlib import Path

@@ -25,8 +26,7 @@ def send_audio_file(file_name):


def validate_segmentation(segment):
    """Validate the segmentation before accepting the annotation's upload from users
    """
    """Validate the segmentation before accepting the annotation's upload from users"""
    required_key = {"start_time", "end_time", "transcription"}

    if set(required_key).issubset(segment.keys()):
@@ -44,8 +44,7 @@ def generate_segmentation(
    data_id,
    segmentation_id=None,
):
    """Generate a Segmentation from the required segment information
    """
    """Generate a Segmentation from the required segment information"""
    if segmentation_id is None:
        segmentation = Segmentation(
            data_id=data_id,
@@ -190,3 +189,98 @@ def add_data():
        ),
        201,
    )


def download_file(url, save_path=None):
    """Download the file at `url` to `save_path` (or a name derived from the URL) and return the local path."""
    local_filename = url.split("/")[-1] if save_path is None else save_path
    # Stream the download so large audio files are written in chunks rather than held in memory
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename


@api.route("/dataWithUrl", methods=["POST"])
def add_data_from_url():
    api_key = request.headers.get("Authorization", None)

    if not api_key:
        raise BadRequest(description="API Key missing from `Authorization` Header")

    project = Project.query.filter_by(api_key=api_key).first()

    if not project:
        raise NotFound(description="No project exist with given API Key")

    username = request.form.get("username", None)
    user = User.query.filter_by(username=username).first()

    if not user:
        raise NotFound(description="No user found with given username")

    segmentations = request.form.get("segmentations", "[]")
    reference_transcription = request.form.get("reference_transcription", None)
    data_url = request.form.get("data_url", None)
    is_marked_for_review = bool(request.form.get("is_marked_for_review", False))

    if data_url is None:
        raise BadRequest(description="`data_url` missing from request")

    original_filename = secure_filename(data_url.split("/")[-1])

    extension = Path(original_filename).suffix.lower()

    if len(extension) > 1 and extension[1:] not in ALLOWED_EXTENSIONS:
        raise BadRequest(description="File format is not supported")

    filename = f"{str(uuid.uuid4().hex)}{extension}"

    # Download the remote file into the upload folder under the generated name
    file_path = Path(app.config["UPLOAD_FOLDER"]).joinpath(filename)
    download_file(data_url, file_path.as_posix())

    data = Data(
        project_id=project.id,
        filename=filename,
        original_filename=original_filename,
        reference_transcription=reference_transcription,
        is_marked_for_review=is_marked_for_review,
        assigned_user_id=user.id,
    )
    db.session.add(data)
    db.session.flush()

    segmentations = json.loads(segmentations)

    new_segmentations = []

    for segment in segmentations:
        validated = validate_segmentation(segment)

        if not validated:
            raise BadRequest(description="Segmentations have missing keys.")

        new_segment = generate_segmentation(
            data_id=data.id,
            project_id=project.id,
            end_time=float(segment["end_time"]),
            start_time=float(segment["start_time"]),
            annotations=segment.get("annotations", {}),
            transcription=segment["transcription"],
        )

        new_segmentations.append(new_segment)

    data.set_segmentations(new_segmentations)

    db.session.commit()
    db.session.refresh(data)

    return (
        jsonify(
            data_id=data.id,
            message="Data uploaded, created and assigned successfully",
            type="DATA_CREATED",
        ),
        201,
    )
21 changes: 16 additions & 5 deletions docs/tutorials/upload-data.md
@@ -2,29 +2,40 @@

The tool provides an endpoint to upload datapoints. You need an API Key, which can be found on the admin dashboard for each project. To upload datapoints for a project, make a `POST` request to the `/api/data` endpoint (or `/api/dataWithUrl` when adding from a remote URL), passing the API Key in the `Authorization` header. Labels for the data can also be uploaded; a minimal request example follows the field lists below.

For every datapoint, we need to provide the following required information:
The following are the ways datapoints can be created, together with their respective requirements:

1. `audio_file`: The audio binary file of `mp3`, `wav` or `ogg` format along with filename.
2. `username`: The username to whom this audio needs to be assigned for annotation. It should be one of the users created.
1. Using a remote audio file URL
    1. `data_url`: The URL of an audio file accessible via a plain HTTP request (for example, with Python `requests`).
2. Using local audio files
    1. `audio_file`: The audio binary file of `mp3`, `wav` or `ogg` format along with filename.
    2. `username`: The username to whom this audio needs to be assigned for annotation. It should be one of the users created.

You can also provide the following optional information:

1. `reference_transcription`: Transcription of audio for reference.
2. `is_marked_for_review`: Whether this audio should be marked for review or not.
3. `segmentations`: The list of segmentation values for the given audio.
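
If you want to call the API directly rather than through the CLI script, the request looks roughly like the sketch below. This is a minimal example using Python `requests`; the host, port, API Key, username, and audio URL are placeholders you would replace with your own values, and it targets the `/api/dataWithUrl` endpoint introduced for remote files (use `/api/data` with a multipart `audio_file` for local uploads).

```python
import json

import requests

API_KEY = "4369e45d3a94466b8fe1efb86b8a4392"  # placeholder: your project's API Key
BASE_URL = "http://localhost:80/api"          # placeholder: backend host and port

# Create a datapoint from a remote audio file URL.
response = requests.post(
    f"{BASE_URL}/dataWithUrl",
    headers={"Authorization": API_KEY},
    data={
        "username": "admin",                                 # an existing user
        "data_url": "https://example.com/audio/sample.ogg",  # placeholder URL
        "reference_transcription": "Glue the sheet to the dark blue background.",
        "is_marked_for_review": "True",
        "segmentations": json.dumps([]),                     # optional list of segments
    },
)
response.raise_for_status()
print(response.json())  # expects type "DATA_CREATED" along with the new data_id
```

The example CLI script referenced below wraps essentially this request.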

We provide an [example CLI script](../../examples/upload_data/upload_data.py) to show how to upload the datapoints.

For example, you can add data with reference transcripts:

```sh
# create a datapoint from a local audio file
API_KEY=4369e45d3a94466b8fe1efb86b8a4392 python upload_data.py --username admin --is_marked_for_review True --audio_file OSR_us_000_0010_8k.wav --host localhost --port 80 --reference_transcription "The birch canoe slid on the smooth planks. Glue the sheet to the dark blue background. It's easy to tell the depth of a well. These days a chicken leg is a rare dish. Rice is often served in round bowls. The juice of lemons makes fine punch. The box was thrown beside the parked truck. The hogs were fed chopped corn and garbage. Four hours of steady work faced us. Large size in stockings is hard to sell."
```

or

```sh
# create a datapoint from a remote audio file URL
API_KEY=67cf63744f0f444f98a4326f37b53b93 python3 upload_data.py --username admin --is_marked_for_review True --host localhost --port 3000 --reference_transcription "Glue the sheet to the dark blue background." --data_url "https://static.wikia.nocookie.net/soundeffects/images/3/31/Bird_Singing_Chirp_Sound.ogg/revision/latest?cb=20210122103806"
```

or

add data with segmentation values:

```sh
API_KEY=cb0ac22ca0404fd19e89162bee8c462b python upload_data.py --username admin --is_marked_for_review True --audio_file OSR_us_000_0010_8k.wav --host localhost --port 5000 --segmentations '[ { "annotations": { "testing this": { "values": [ "4", "5" ] } }, "end_time": 7.7407, "start_time": 3.8604, "transcription": "Sample transcription data" }, { "end_time": 17.7407, "start_time": 13.8604, "transcription": "Sample transcription data" }]'
```
45 changes: 32 additions & 13 deletions examples/upload_data/upload_data.py
@@ -39,27 +39,25 @@
help="List of segmentations for the audio",
default=[],
)
parser.add_argument(
"--data_url",
type=str,
help="Url of ",
default="",
)
parser.add_argument("--port", type=int, help="Port to make request to", default=80)

args = parser.parse_args()

api_key = os.getenv("API_KEY", None)
headers = {"Authorization": api_key}

audio_path = Path(args.audio_file)
audio_filename = audio_path.name
if audio_path.is_file():
    audio_obj = open(audio_path.resolve(), "rb")
else:
    print("Audio file does not exist")
    exit()

data_url = args.data_url
reference_transcription = args.reference_transcription
username = args.username
is_marked_for_review = args.is_marked_for_review
segmentations = args.segmentations

file = {"audio_file": (audio_filename, audio_obj)}

values = {
    "reference_transcription": reference_transcription,
@@ -68,10 +66,31 @@
    "is_marked_for_review": is_marked_for_review,
}

print("Creating datapoint")
response = requests.post(
f"http://{args.host}:{args.port}/api/data", files=file, data=values, headers=headers
)

print("Creating datapoint {}".format(f"from url: {data_url}" if data_url else ""))

if data_url:
values.update({"data_url": data_url})
response = requests.post(
f"http://{args.host}:{args.port}/api/dataWithUrl", data=values, headers=headers
)
else:
audio_path = Path(args.audio_file)
audio_filename = audio_path.name
if audio_path.is_file():
audio_obj = open(audio_path.resolve(), "rb")
else:
print("Audio file does not exist")
exit()
file = {"audio_file": (audio_filename, audio_obj)}

response = requests.post(
f"http://{args.host}:{args.port}/api/data",
files=file,
data=values,
headers=headers,
)


if response.status_code == 201:
    response_json = response.json()