Added Mistral and Llama generators
Michael Fuest committed Sep 11, 2024
1 parent bfd769e commit 16cc19f
Showing 5 changed files with 84 additions and 194 deletions.
7 changes: 0 additions & 7 deletions .github/workflows/tests.yml
@@ -13,7 +13,6 @@ jobs:
matrix:
python-version: [3.8, 3.9]
os: [ubuntu-latest, macos-latest]
max-parallel: 1 # Run jobs sequentially (one at a time)

steps:
- uses: actions/checkout@v3
@@ -39,16 +38,10 @@ jobs:
python -m pip install --upgrade pip
pip install tox tox-gh-actions
- name: Clear pip cache to save disk space
run: pip cache purge

# Run tests with tox
- name: Test with tox
run: tox

- name: Check disk space after installation
run: df -h

# Upload coverage to Codecov
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
File renamed without changes.
3 changes: 3 additions & 0 deletions generator/llm/hf_prompt_template.json
@@ -0,0 +1,3 @@
{
"prompt_template": "You are a helpful assistant that generates synthetic time series data representing household-level electricity load profiles with measurements taken in 15 min intervals throughout the day. The user will provide an example time series for a month and weekday combination. Your task is to generate a synthetic time series sampled from the same distribution with EXACTLY {length} NONZERO values separated by commas. It is crucial that you generate precisely {length} numeric values, no more and no less. Do not generate any additional text, just a list of exactly {length} comma-separated numeric nonzero values. The generated time series should look similar to this example: {example_ts}"
}
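The two placeholders, {length} and {example_ts}, are filled in at generation time: HF.load_prompt_template below reads this file and HF.generate_timeseries formats it. A minimal sketch of that formatting step, with illustrative values:

import json

with open("generator/llm/hf_prompt_template.json") as f:
    template = json.load(f)["prompt_template"]

example_ts = "0.1050,0.0981,0.1123"  # toy excerpt of a 15-minute load profile
prompt = template.format(length=96, example_ts=example_ts)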
253 changes: 72 additions & 181 deletions generator/llm/llm.py
@@ -1,154 +1,32 @@
# -*- coding: utf-8 -*-

import json
import logging
import os

import tiktoken
import numpy as np
import torch
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer

PROMPT_PATH = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "gpt_messages.json"
)

PROMPTS = json.load(open(PROMPT_PATH))
from generator.llm.preprocessing import Signal2String

HF_PROMPT_PATH = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "hf_prompt_template.json"
)
VALID_NUMBERS = list("0123456789")
BIAS = 30

LOGGER = logging.getLogger(__name__)

DEFAULT_BOS_TOKEN = "<|begin_of_text|>"
DEFAULT_EOS_TOKEN = "<|end_of_text|>"
DEFAULT_UNK_TOKEN = "<unk>"
DEFAULT_PAD_TOKEN = "<pad>"

DEFAULT_MODEL = "meta-llama/Meta-Llama-3.1-8B"
MISTRAL_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"


class GPT:
"""Prompt GPT models to generate a time series.
Args:
name (str):
Model name. Default to `'gpt-4o'`.
chat (bool):
Whether you're using a chat model or not. Default to `True`.
sep (str):
String to separate each element in values. Default to `','`.
"""

def __init__(self, name="gpt-4o", chat=True, sep=","):
self.name = name
self.chat = chat
self.sep = sep

self.client = OpenAI()
self.tokenizer = tiktoken.encoding_for_model(self.name)

valid_tokens = []
for number in VALID_NUMBERS:
token = self.tokenizer.encode(number)
valid_tokens.extend(token)

valid_tokens.extend(self.tokenizer.encode(self.sep))
self.logit_bias = {token: BIAS for token in valid_tokens}
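For context on the bias dictionary built above: OpenAI's logit_bias maps token ids to values between -100 and 100 that are added to the corresponding logits, so a bias of 30 strongly favours the digit and separator tokens without outright forbidding everything else. A toy sketch of the same construction, assuming tiktoken has an encoding for the model:

import tiktoken

# Illustrative: favour digit and "," tokens when sampling from gpt-4o.
encoder = tiktoken.encoding_for_model("gpt-4o")
valid_ids = [tok for ch in "0123456789," for tok in encoder.encode(ch)]
logit_bias = {tok: 30 for tok in valid_ids}  # passed with the completions request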

def generate(
self,
text,
length=96,
temp=1,
top_p=1,
logprobs=False,
top_logprobs=None,
samples=1,
seed=None,
):
"""Use GPT to generate a signal.
Args:
text (str):
A string containing an example time series.
length (int):
Desired length of the generated time series.
temp (float):
Sampling temperature to use, between 0 and 2. Higher values like 0.8 will
make the output more random, while lower values like 0.2 will make it
more focused and deterministic. Do not use with `top_p`. Default to `1`.
top_p (float):
Alternative to sampling with temperature, called nucleus sampling, where the
model considers the results of the tokens with top_p probability mass.
So 0.1 means only the tokens comprising the top 10% probability mass are
considered. Do not use with `temp`. Default to `1`.
logprobs (bool):
Whether to return the log probabilities of the output tokens or not.
Defaults to `False`.
top_logprobs (int):
An integer between 0 and 20 specifying the number of most likely tokens
to return at each token position. Default to `None`.
samples (int):
Number of forecasts to generate for each input message. Default to `1`.
seed (int):
Beta feature by OpenAI to sample deterministically. Default to `None`.
Returns:
list, list:
* List of generated signal values.
* Optionally, a list of the output tokens' log probabilities.
"""
input_length = len(self.tokenizer.encode(text))
average_length = (input_length + 1) // len(text.split(","))
max_tokens = average_length * length

if self.chat:
message = " ".join([PROMPTS["user_message"], text, self.sep])
response = self.client.chat.completions.create(
model=self.name,
messages=[
{"role": "system", "content": PROMPTS["system_message"]},
{"role": "user", "content": message},
],
max_tokens=max_tokens,
temperature=temp,
logprobs=logprobs,
top_logprobs=top_logprobs,
n=samples,
)
responses = [choice.message.content for choice in response.choices]

else:
message = " ".join(text, self.sep)
response = self.client.completions.create(
model=self.name,
prompt=message,
max_tokens=max_tokens,
temperature=temp,
logprobs=logprobs,
top_logprobs=top_logprobs,
logit_bias=self.logit_bias,
n=samples,
)
responses = [choice.text for choice in response.choices]

if logprobs:
probs = [choice.logprobs for choice in response.choices]
return responses, probs

return responses
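Before this commit removes it, the GPT wrapper could be driven roughly as follows; a sketch only, assuming OPENAI_API_KEY is set and using a toy example string:

# Illustrative call: ask gpt-4o for one synthetic 96-value day profile.
gpt = GPT(name="gpt-4o", chat=True, sep=",")
example = "0.1050,0.0981,0.1123,0.0997"  # toy excerpt; real prompts pass a full day
profiles = gpt.generate(example, length=96, temp=0.7, samples=1)
print(profiles[0])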
LOGGER = logging.getLogger(__name__)


class HF:
"""Prompt Pretrained models on HuggingFace to forecast a time series.
Args:
name (str):
Model name. Default to `'meta-llama/Meta-Llama-3.1-8B'`.
sep (str):
String to separate each element in values. Default to `','`.
name (str): Model name. Default to `'meta-llama/Meta-Llama-3.1-8B'`.
sep (str): String to separate each element in values. Default to `','`.
"""

def __init__(self, name=DEFAULT_MODEL, sep=","):
@@ -157,7 +35,6 @@ def __init__(self, name=DEFAULT_MODEL, sep=","):

self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False)

# special tokens
if name == MISTRAL_MODEL:
special_tokens_dict = dict()
if self.tokenizer.eos_token is None:
@@ -173,71 +50,57 @@ def __init__(self, name=DEFAULT_MODEL, sep=","):

self.tokenizer.pad_token = (
self.tokenizer.eos_token
) # indicate the end of the time series

# invalid tokens
valid_tokens = []
for number in VALID_NUMBERS:
token = self.tokenizer.convert_tokens_to_ids(number)
valid_tokens.append(token)
) # Indicate end of the time series

valid_tokens.append(self.tokenizer.convert_tokens_to_ids(self.sep))
self.invalid_tokens = [
[i] for i in range(len(self.tokenizer) - 1) if i not in valid_tokens
# Define invalid tokens (tokens that are not digits or commas)
valid_tokens = [
self.tokenizer.convert_tokens_to_ids(str(digit)) for digit in VALID_NUMBERS
]
valid_tokens.append(self.tokenizer.convert_tokens_to_ids(self.sep))
vocab_size = self.tokenizer.vocab_size

if any(token >= vocab_size for token in valid_tokens):
raise ValueError(
f"Some valid tokens are outside the model's vocabulary size of {vocab_size}"
)

self.invalid_tokens = [[i] for i in range(vocab_size) if i not in valid_tokens]

# Load the model
self.model = AutoModelForCausalLM.from_pretrained(
self.name,
device_map="auto",
torch_dtype=torch.bfloat16,
self.name, device_map="auto", torch_dtype=torch.bfloat16
)

self.model.eval()

def load_prompt_template(self):
"""Load the prompt template from a JSON file."""
with open(HF_PROMPT_PATH) as f:
template = json.load(f)["prompt_template"]
return template

def generate_timeseries(
self, text, length=96, temp=1, top_p=1, raw=False, samples=1, padding=0
self, example_ts, length=96, temp=1, top_p=1, raw=False, samples=1, padding=0
):
"""Use the model to generate a signal.
Args:
text (str):
A string containing an example time series.
length (int):
Desired length of the generated time series.
temp (float):
The value used to modulate the next token probabilities. Default to `1`.
top_p (float):
If set to float < 1, only the smallest set of most probable tokens with
probabilities that add up to `top_p` or higher are kept for generation.
Default to `1`.
raw (bool):
Whether to return the raw output or not. Defaults to `False`.
samples (int):
Number of forecasts to generate for each input message. Default to `1`.
padding (int):
Additional padding token to forecast to reduce short horizon predictions.
Default to `0`.
"""Generate a time series forecast."""
template = self.load_prompt_template()
prompt = template.format(length=length, example_ts=example_ts)

Returns:
list, list:
* List of forecasted signal values.
* Optionally, a list of dictionaries for raw output.
"""
tokenized_input = self.tokenizer([text], return_tensors="pt").to("cuda")
tokenized_input = self.tokenizer([prompt], return_tensors="pt").to("cuda")
tokenized_ts = self.tokenizer([example_ts], return_tensors="pt").to("cuda")

input_length = tokenized_input["input_ids"].shape[1]
average_length = input_length / len(text.split(","))
input_length = tokenized_ts["input_ids"].shape[1]
average_length = input_length / len(example_ts.split(","))
max_tokens = (average_length + padding) * length

generate_ids = self.model.generate(
**tokenized_input,
do_sample=True,
max_new_tokens=max_tokens,
max_new_tokens=int(max_tokens),
temperature=temp,
top_p=top_p,
# bad_words_ids=self.invalid_tokens,
renormalize_logits=True,
num_return_sequences=samples
bad_words_ids=self.invalid_tokens, # Ensure no invalid tokens (only digits and commas)
num_return_sequences=samples,
)

responses = self.tokenizer.batch_decode(
@@ -249,17 +112,45 @@ def generate_timeseries(
if raw:
return responses, generate_ids

return responses
processed_responses = []
for response in responses:
values = response.split(self.sep)
values = [
v.strip() for v in values if v.strip().replace(".", "", 1).isdigit()
] # Remove invalid entries
processed_responses.append(self.sep.join(values))

return processed_responses
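As a rough worked example of the budget and filtering above (numbers illustrative): an example series of 96 values that tokenizes to about 480 ids gives average_length = 5, so with padding=0 the call requests max_new_tokens = int(5 * 96) = 480. A decoded response is then stripped of anything that is not a plain number:

# Illustrative post-processing of a raw decode (the response text is made up).
sep = ","
raw = "0.1050, 0.0981, done, 0.1123"
values = [v.strip() for v in raw.split(sep) if v.strip().replace(".", "", 1).isdigit()]
print(sep.join(values))  # -> 0.1050,0.0981,0.1123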

def train_model(self, dataset):
"""Train the model on the given dataset."""
self.dataset = dataset.data

def generate(self, day_labels, month_labels):
"""Generate time series based on weekday and month labels."""
assert (
day_labels.shape == month_labels.shape
), "Number of weekday and month labels must be equal!"

gen_ts_dataset = []

for day, month in zip(day_labels, month_labels):
example_ts = self.dataset[
self.dataset["weekday"] == "day" & self.dataset["month"] == "month"
]
example_ts = (
self.dataset[
(self.dataset["weekday"] == day.item())
& (self.dataset["month"] == month.item())
]
.sample(1)
.timeseries.values[0][:, 0]
)

converter = Signal2String(decimal=4)
example_ts_string = converter.transform(example_ts)
gen_ts_string = self.generate_timeseries(example_ts_string)
gen_ts = converter.reverse_transform(
gen_ts_string[0], trunc=example_ts.shape[0]
)
gen_ts_dataset.append(gen_ts)

gen_ts_dataset = torch.tensor(np.array(gen_ts_dataset))
return gen_ts_dataset
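generate relies on Signal2String from generator.llm.preprocessing to move between arrays and comma-separated strings; its implementation is not part of this diff, so the round-trip below is only a sketch that mirrors how it is called above, with made-up data:

import numpy as np
from generator.llm.preprocessing import Signal2String

example_ts = np.random.rand(96)              # one day of 15-minute readings (toy data)
converter = Signal2String(decimal=4)
ts_string = converter.transform(example_ts)  # e.g. "0.5488,0.7152,..."
# gen_string = hf.generate_timeseries(ts_string)[0]  # heavy: needs the loaded model
# gen_ts = converter.reverse_transform(gen_string, trunc=example_ts.shape[0])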
15 changes: 9 additions & 6 deletions main.py
@@ -1,8 +1,9 @@
import torch

from data_utils.openpower import OpenPowerDataset
from data_utils.pecanstreet import PecanStreetDataset
from eval.evaluator import Evaluator
from generator.llm.llm import HF
from generator.llm.preprocessing import Signal2String


def evaluate_individual_user_models(
@@ -30,20 +31,22 @@ def evaluate_single_dataset_model(


def evaluate_llm():
hf = HF(name="meta-llama/Meta-Llama-3.1-8B", sep=",")
hf = HF(name="mistralai/Mistral-7B-Instruct-v0.2", sep=",")
full_dataset = PecanStreetDataset(
normalize=True, include_generation=False, threshold=(-5, 5)
)
user_dataset = full_dataset.create_user_dataset(661)
row = user_dataset.data.iloc[0]
weekdays = [row.weekday]
months = [row.month]
weekdays = torch.tensor([row.weekday])
months = torch.tensor([row.month])
hf.train_model(user_dataset)
hf.generate(weekdays, months)


def main():
evaluate_individual_user_models("diffusion_ts", "newyork")
# evaluate_single_dataset_model("acgan", "newyork")
# evaluate_individual_user_models("diffusion_ts", "newyork")
# evaluate_single_dataset_model("diffusion_ts", "austin")
evaluate_llm()


if __name__ == "__main__":
