From bf9eef39cefdcb091dc9626c0537eacaefaaaa64 Mon Sep 17 00:00:00 2001
From: Danrui Qi <qidanrui@gmail.com>
Date: Thu, 28 Dec 2023 19:23:45 +0800
Subject: [PATCH] PLEASE DO NOT MERGE!! Add pylintrc (still need to reformat
 existing code manually to pass pylint checking) (#199)

---
 .github/workflows/ci.yml                      |   4 +-
 .pylintrc                                     | 571 ++++++++++++++++++
 dbgpt_hub/data_generator/__init__.py          |   8 +
 dbgpt_hub/data_generator/gpt_generator.py     | 143 +++++
 dbgpt_hub/data_generator/gpt_generator_api.py |  41 ++
 dbgpt_hub/data_generator/llm_generator.py     |  24 +
 dbgpt_hub/data_generator/utils.py             |  35 ++
 poetry.lock                                   |  34 ++
 pyproject.toml                                |   3 +-
 9 files changed, 860 insertions(+), 3 deletions(-)
 create mode 100644 .pylintrc
 create mode 100644 dbgpt_hub/data_generator/__init__.py
 create mode 100644 dbgpt_hub/data_generator/gpt_generator.py
 create mode 100644 dbgpt_hub/data_generator/gpt_generator_api.py
 create mode 100644 dbgpt_hub/data_generator/llm_generator.py
 create mode 100644 dbgpt_hub/data_generator/utils.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4fb44e0..9bd7d7a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -59,8 +59,8 @@ jobs:
       # - name: Type check the project
       #   run: poetry run pyright dbgpt_hub || true
 
-      # - name: Style check the project
-      #   run: poetry run pylint dbgpt_hub || true
+      - name: Style check the project
+        run: poetry run pylint dbgpt_hub || true
 
       - name: Build binary dependencies
         run: poetry build
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..2b10e9b
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,571 @@
+[MASTER]
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loaded into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-whitelist=lxml, pandas._libs.missing
+
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=CVS,tests
+
+# Add files or directories matching the regex patterns to the blacklist. The
+# regex matches against base names, not paths.
+ignore-patterns=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+init-hook='import sys; sys.setrecursionlimit(8 * sys.getrecursionlimit())'
+
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use.
+jobs=1
+
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+limit-inference-results=100
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# Specify a configuration file.
+#rcfile=
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
+confidence=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=print-statement,
+        parameter-unpacking,
+        unpacking-in-except,
+        old-raise-syntax,
+        backtick,
+        long-suffix,
+        old-ne-operator,
+        old-octal-literal,
+        import-star-module-level,
+        non-ascii-bytes-literal,
+        raw-checker-failed,
+        bad-inline-option,
+        locally-disabled,
+        file-ignored,
+        suppressed-message,
+        useless-suppression,
+        deprecated-pragma,
+        use-symbolic-message-instead,
+        apply-builtin,
+        basestring-builtin,
+        buffer-builtin,
+        cmp-builtin,
+        coerce-builtin,
+        execfile-builtin,
+        file-builtin,
+        long-builtin,
+        raw_input-builtin,
+        reduce-builtin,
+        standarderror-builtin,
+        unicode-builtin,
+        xrange-builtin,
+        coerce-method,
+        delslice-method,
+        getslice-method,
+        setslice-method,
+        no-absolute-import,
+        old-division,
+        dict-iter-method,
+        dict-view-method,
+        next-method-called,
+        metaclass-assignment,
+        indexing-exception,
+        raising-string,
+        reload-builtin,
+        oct-method,
+        hex-method,
+        nonzero-method,
+        cmp-method,
+        input-builtin,
+        round-builtin,
+        intern-builtin,
+        unichr-builtin,
+        map-builtin-not-iterating,
+        zip-builtin-not-iterating,
+        range-builtin-not-iterating,
+        filter-builtin-not-iterating,
+        using-cmp-argument,
+        eq-without-hash,
+        div-method,
+        idiv-method,
+        rdiv-method,
+        exception-message-attribute,
+        invalid-str-codec,
+        sys-max-int,
+        bad-python3-import,
+        deprecated-string-function,
+        deprecated-str-translate-call,
+        deprecated-itertools-function,
+        deprecated-types-field,
+        next-method-defined,
+        dict-items-not-iterating,
+        dict-keys-not-iterating,
+        dict-values-not-iterating,
+        deprecated-operator-function,
+        deprecated-urllib-function,
+        xreadlines-attribute,
+        deprecated-sys-function,
+        exception-escape,
+        comprehension-escape,
+        bad-continuation,
+        ungrouped-imports,
+        too-few-public-methods,
+        redefined-outer-name,
+        no-else-return,
+        unsubscriptable-object
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifiers separated by commas (,) or put this option
+# multiple times (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=c-extension-no-member
+
+
+[REPORTS]
+
+# Python expression which should return a score of at most 10 (10 is the
+# highest score). You have access to the variables error, warning, refactor,
+# convention and statement, which respectively contain the number of messages
+# in each of those categories and the total number of statements analyzed.
+# This is used by the global evaluation report (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+#msg-template=
+
+# Set the output format. Available formats are text, parseable, colorized, json
+# and msvs (visual studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=yes
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete name of functions that never returns. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit
+
+
+[SPELLING]
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Maximum number of characters on a single line.
+max-line-length=100
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# List of optional constructs for which whitespace checking is disabled. `dict-
+# separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
+# `trailing-comma` allows a space between comma and closing bracket: (a, ).
+# `empty-line` allows space-only lines.
+no-space-check=trailing-comma,
+               dict-separator
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+
+[LOGGING]
+
+# Format style used to check logging format string. `old` means using %
+# formatting, while `new` is for `{}` formatting.
+logging-format-style=old
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
+
+
+[SIMILARITIES]
+
+# Ignore comments when computing similarities.
+ignore-comments=no
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+# Minimum lines number of a similarity.
+min-similarity-lines=100
+
+[VARIABLES]
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables=yes
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,
+          _cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
+
+
+[STRING]
+
+# This flag controls whether the implicit-str-concat-in-sequence should
+# generate a warning on implicit string concatenation in sequences defined over
+# several lines.
+check-str-concat-over-line-jumps=no
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis. It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=XXX
+
+
+[BASIC]
+
+# Naming style matching correct argument names.
+argument-naming-style=snake_case
+
+# Regular expression matching correct argument names. Overrides argument-
+# naming-style.
+#argument-rgx=
+
+# Naming style matching correct attribute names.
+attr-naming-style=snake_case
+
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style.
+#attr-rgx=
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names=foo,
+          bar,
+          baz,
+          toto,
+          tutu,
+          tata
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style=any
+
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style.
+#class-attribute-rgx=
+
+# Naming style matching correct class names.
+class-naming-style=PascalCase
+
+# Regular expression matching correct class names. Overrides class-naming-
+# style.
+#class-rgx=
+
+# Naming style matching correct constant names.
+const-naming-style=UPPER_CASE
+
+# Regular expression matching correct constant names. Overrides const-naming-
+# style.
+#const-rgx=
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+# Naming style matching correct function names.
+function-naming-style=snake_case
+
+# Regular expression matching correct function names. Overrides function-
+# naming-style.
+#function-rgx=
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names=i, j, k,
+           ex,
+           Run,
+           df, f, x, y, z, ys, xs,n, x2, y2, xy,
+           _
+
+# Include a hint for the correct naming format with invalid-name.
+include-naming-hint=no
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style=any
+
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style.
+#inlinevar-rgx=
+
+# Naming style matching correct method names.
+method-naming-style=snake_case
+
+# Regular expression matching correct method names. Overrides method-naming-
+# style.
+#method-rgx=
+
+# Naming style matching correct module names.
+module-naming-style=snake_case
+
+# Regular expression matching correct module names. Overrides module-naming-
+# style.
+#module-rgx=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+# These decorators are taken in consideration only for invalid-name.
+property-classes=abc.abstractproperty
+
+# Naming style matching correct variable names.
+variable-naming-style=snake_case
+
+# Regular expression matching correct variable names. Overrides variable-
+# naming-style.
+#variable-rgx=
+
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+                  _fields,
+                  _replace,
+                  _source,
+                  _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=cls
+
+
+[IMPORTS]
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Deprecated modules which should not be used, separated by a comma.
+deprecated-modules=optparse,tkinter.tix
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled).
+ext-import-graph=
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled).
+import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled).
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method.
+max-args=5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Maximum number of boolean expressions in an if statement.
+max-bool-expr=5
+
+# Maximum number of branch for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Pylint 2.16 deprecated
+# bare names here in favour of fully qualified ones ("builtins.Exception").
+overgeneral-exceptions=builtins.BaseException,
+                       builtins.Exception
\ No newline at end of file
diff --git a/dbgpt_hub/data_generator/__init__.py b/dbgpt_hub/data_generator/__init__.py
new file mode 100644
index 0000000..dee6f5c
--- /dev/null
+++ b/dbgpt_hub/data_generator/__init__.py
@@ -0,0 +1,8 @@
+"""
+dbgpt_hub.data_generator
+========================
+"""
+
+from .gpt_generator_api import generate_dataset_with_gpt
+
+__all__ = ["generate_dataset_with_gpt"]
diff --git a/dbgpt_hub/data_generator/gpt_generator.py b/dbgpt_hub/data_generator/gpt_generator.py
new file mode 100644
index 0000000..3023791
--- /dev/null
+++ b/dbgpt_hub/data_generator/gpt_generator.py
@@ -0,0 +1,143 @@
+import ast
+import json
+import os
+
+from openai import OpenAI
+from tqdm import tqdm
+
+from .llm_generator import LLMGenerator
+from .utils import COT_PROMPT, FEW_SHOTS_EXAMPLE
+
+
+class GPTGenerator(LLMGenerator):
+    """Generate a synthetic Text2SQL dataset by prompting an OpenAI chat model.
+
+    Empty ``table_file_path``, ``output_path`` or ``prompt`` arguments fall back
+    to the Spider default paths and ``COT_PROMPT``. The API key is read from the
+    ``OPENAI_API_KEY`` environment variable each time a request is sent.
+    """
+
+    def __init__(
+        self,
+        model: str = "gpt-3.5-turbo-16k",
+        model_temperature: float = 0,
+        max_tokens: int = 2048,
+        prompt: str = "",
+        num_text2sql_pair_each_db: int = 10,
+        table_file_path: str = "",
+        output_path: str = "",
+    ):
+        self.table_file_path = table_file_path or "../data/spider/tables.json"
+        self.output_path = (
+            output_path or "../data/spider/synthetic_data_with_gpt.json"
+        )
+        self.prompt = prompt or COT_PROMPT
+        self.model = model
+        self.model_temperature = model_temperature
+        self.max_tokens = max_tokens
+        self.num_text2sql_pair_each_db = num_text2sql_pair_each_db
+
+        self.synthetic_dataset = []
+
+    def generate_synthetic_dataset(self):
+        """Generate Text2SQL pairs for every database and write them to disk.
+
+        Loads the schema list from ``self.table_file_path``, sends
+        ``self.num_text2sql_pair_each_db`` requests per database, collects the
+        parsed pairs in ``self.synthetic_dataset`` and writes them out. When a
+        request or its parsing fails, the rest of that database is skipped and
+        reported; pairs collected before the failure are kept.
+        """
+        synthetic_dataset = []
+
+        with open(self.table_file_path, encoding="utf-8") as table_file:
+            databases = json.load(table_file)
+
+        # Split the requested pairs over the three difficulty levels, the
+        # remainder going to "hard". The earlier division by the number of
+        # databases tied the split to the size of the tables file and raised
+        # ZeroDivisionError on an empty one.
+        easy_count = medium_count = self.num_text2sql_pair_each_db // 3
+        hard_count = self.num_text2sql_pair_each_db - easy_count - medium_count
+
+        for item in tqdm(databases):
+            schema = self._describe_schema(item)
+            try:
+                # NOTE(review): each request already asks for easy + medium +
+                # hard pairs, so this loop requests about n * n pairs per
+                # database - confirm that this is intended.
+                for _ in range(self.num_text2sql_pair_each_db):
+                    reply = self._chat_llm(
+                        self.prompt.format(
+                            easy_count=easy_count,
+                            medium_count=medium_count,
+                            hard_count=hard_count,
+                            schema=schema,
+                            few_shots_example=FEW_SHOTS_EXAMPLE,
+                        )
+                    )
+                    synthetic_dataset += self._parse_reply(reply)
+            except Exception as err:  # pylint: disable=broad-except
+                # Best effort: keep going, but say why this database was dropped
+                # (a bare ``except`` used to hide even a missing API key).
+                tqdm.write(f"Skipping {item['db_id']}: {err!r}")
+
+        self.synthetic_dataset = synthetic_dataset
+        self._writeout_dataset()
+
+    @staticmethod
+    def _describe_schema(item):
+        """Render one ``tables.json`` entry as the schema text used in the prompt."""
+        table_names = item["table_names_original"]
+        # The first column entry is skipped (presumably Spider's "*" column -
+        # TODO confirm), so column index ``c`` is found at ``columns[c - 1]``.
+        columns = item["column_names_original"][1:]
+        schema = (
+            f"{item['db_id']} database contains tables such as "
+            f"{', '.join(table_names)}. "
+        )
+        for i, name in enumerate(table_names):
+            names = ", ".join(col[1] for col in columns if col[0] == i)
+            schema += f"Table {name} has columns such as {names}. "
+            for key in item["primary_keys"]:
+                if columns[key - 1][0] == i:
+                    schema += f"{columns[key - 1][1]} is the primary key.\n"
+
+        # Foreign keys are described once per database; they used to be
+        # repeated after every table because this loop was nested too deeply.
+        for key in item["foreign_keys"]:
+            src, dst = columns[key[0] - 1], columns[key[1] - 1]
+            schema += (
+                f"The {src[1]} of {table_names[src[0]]} is the foreign key of "
+                f"{dst[1]} of {table_names[dst[0]]}.\n"
+            )
+        return schema
+
+    @staticmethod
+    def _parse_reply(reply):
+        """Turn the model reply into a list of pairs without executing it."""
+        # The reply is untrusted text shaped like the few-shot example (a
+        # Python-style literal), so only literals are accepted, unlike ``eval``.
+        pairs = ast.literal_eval(reply.strip())
+        if not isinstance(pairs, list):
+            raise ValueError("expected a list of text2sql pairs")
+        return pairs
+
+    def _chat_llm(self, prompt):
+        """Send ``prompt`` as one user message and return the reply text."""
+        client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+        completion = client.chat.completions.create(
+            messages=[{"role": "user", "content": prompt}],
+            model=self.model,
+            temperature=self.model_temperature,
+            max_tokens=self.max_tokens,
+            top_p=1,
+            frequency_penalty=0,
+            presence_penalty=0,
+        )
+        return completion.choices[0].message.content
+
+    def _writeout_dataset(self):
+        """Write ``self.synthetic_dataset`` to ``self.output_path`` as JSON."""
+        with open(self.output_path, "w", encoding="utf-8") as out_file:
+            json.dump(self.synthetic_dataset, out_file, indent=4, ensure_ascii=False)
diff --git a/dbgpt_hub/data_generator/gpt_generator_api.py b/dbgpt_hub/data_generator/gpt_generator_api.py
new file mode 100644
index 0000000..d34200f
--- /dev/null
+++ b/dbgpt_hub/data_generator/gpt_generator_api.py
@@ -0,0 +1,41 @@
+import os
+import sys
+
+from typing import Optional, Dict, Any
+from .gpt_generator import GPTGenerator
+from .utils import COT_PROMPT
+
+ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(ROOT_PATH)
+
+
+def generate_dataset_with_gpt(args: Optional[Dict[str, Any]] = None):
+    """Build a ``GPTGenerator`` from ``args`` and generate the synthetic dataset.
+
+    ``args`` may set ``model``, ``prompt``, ``num_text2sql_pair_each_db``,
+    ``table_file_path`` and ``output_path``. Keys that are left out (or
+    ``args=None``) fall back to the defaults below instead of raising KeyError.
+    """
+    spider_dir = os.path.join(ROOT_PATH, "dbgpt_hub/data/spider")
+    settings = {
+        "model": "gpt-3.5-turbo-16k",
+        "prompt": COT_PROMPT,
+        "num_text2sql_pair_each_db": 1,
+        "table_file_path": os.path.join(spider_dir, "tables.json"),
+        "output_path": os.path.join(spider_dir, "synthetic_data_with_gpt.json"),
+    }
+    settings.update(args or {})
+
+    # Only the documented keys are forwarded, so extra entries in ``args`` are ignored.
+    gpt_generator = GPTGenerator(
+        model=settings["model"],
+        prompt=settings["prompt"],
+        num_text2sql_pair_each_db=settings["num_text2sql_pair_each_db"],
+        table_file_path=settings["table_file_path"],
+        output_path=settings["output_path"],
+    )
+    gpt_generator.generate_synthetic_dataset()
+
+
+if __name__ == "__main__":
+    generate_dataset_with_gpt()
diff --git a/dbgpt_hub/data_generator/llm_generator.py b/dbgpt_hub/data_generator/llm_generator.py
new file mode 100644
index 0000000..783acd3
--- /dev/null
+++ b/dbgpt_hub/data_generator/llm_generator.py
@@ -0,0 +1,24 @@
+from abc import ABC, abstractmethod
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+
+class LLMGenerator(ABC):
+    """An interface for large language model data generators.
+
+    An LLM data generator accepts prompts and generates a synthetic Text2SQL
+    dataset. Concrete generators must implement all three methods below;
+    ``_chat_llm`` takes the prompt text, matching ``GPTGenerator._chat_llm``.
+    """
+
+    @abstractmethod
+    def generate_synthetic_dataset(self):
+        """Generate the synthetic dataset."""
+
+    @abstractmethod
+    def _chat_llm(self, prompt):
+        """Send ``prompt`` to the LLM and return its reply."""
+
+    @abstractmethod
+    def _writeout_dataset(self):
+        """Write out the generated dataset."""
diff --git a/dbgpt_hub/data_generator/utils.py b/dbgpt_hub/data_generator/utils.py
new file mode 100644
index 0000000..bfb6697
--- /dev/null
+++ b/dbgpt_hub/data_generator/utils.py
@@ -0,0 +1,35 @@
+COT_PROMPT = """
+Please use the following database information, genarate different difficulty level of natural language questions with their corresponding SQL querires.
+There are three different difficulty levels:
+When generating natual language questions and their corresponding SQL queries, you should consider different SQL operators such as  WHERE, GROUP BY, HAVING, ORDER BY, LIMIT, JOIN, INTERSECT, EXCEPT, UNION, NOT IN, OR, AND, EXISTS, LIKE as well as nested queries.
+Moreover, please make sure that each table in the database appears in at least one query.
+
+There are three different difficulty levels, which are defined as follows:
+Easy: Queries that require basic filtering or aggregation on a single table.
+Medium: Queries that encompass more complex filtering or aggregation and involve joining multiple tables.
+Hard: Queries that entail advanced filtering or aggregation, multiple joins, and the use of subqueries.
+
+Here is the basic information of database: {schema}
+
+Based on the tables, columns, primary keys, foreign keys and different difficulty levels, generate {easy_count} Easy, {medium_count} Medium, and {hard_count} Hard natural language questions with their correlated SQL queries.
+Provide your answer in JSON form. Reply with only the answer in JSON form and include no other commentary:
+RESPONSE FORMAT:
+{few_shots_example}
+
+The "db_id" in the above examples means the name of used database. Do not fill out it with "_database" suffix.
+"""
+
+FEW_SHOTS_EXAMPLE = """
+[
+  {
+    'db_id': 'music_2',
+    'question': 'Who performed the song named "Le Pop"?',
+    'query': 'SELECT T2.firstname, T2.lastname FROM Performance AS T1 JOIN Band AS T2 ON T1.bandmate=T2.id JOIN Songs AS T3 ON T3.SongId=T1.SongId WHERE T3.Title="Le Pop"'
+  },
+  {
+    'db_id': 'insurance_fnol',
+    'question': 'Tell me the types of the policy used by the customer named "Dayana Robel".',
+    'query': 'SELECT DISTINCT t3.policy_type_code FROM customers AS t1 JOIN customers_policies AS t2 ON t1.customer_id=t2.customer_id JOIN available_policies AS t3 ON t2.policy_id=t3.policy_id WHERE t1.customer_name="Dayana Robel"'
+  }
+]
+"""
diff --git a/poetry.lock b/poetry.lock
index a9607d0..90a98ef 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -616,6 +616,17 @@ files = [
 [package.extras]
 graph = ["objgraph (>=1.7.2)"]
 
+[[package]]
+name = "distro"
+version = "1.9.0"
+description = "Distro - an OS platform information API"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"},
+    {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
+]
+
 [[package]]
 name = "docker-pycreds"
 version = "0.4.0"
@@ -1946,6 +1957,29 @@ files = [
     {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"},
 ]
 
+[[package]]
+name = "openai"
+version = "1.6.1"
+description = "The official Python library for the openai API"
+optional = false
+python-versions = ">=3.7.1"
+files = [
+    {file = "openai-1.6.1-py3-none-any.whl", hash = "sha256:bc9f774838d67ac29fb24cdeb2d58faf57de8b311085dcd1348f7aa02a96c7ee"},
+    {file = "openai-1.6.1.tar.gz", hash = "sha256:d553ca9dbf9486b08e75b09e8671e4f638462aaadccfced632bf490fc3d75fa2"},
+]
+
+[package.dependencies]
+anyio = ">=3.5.0,<5"
+distro = ">=1.7.0,<2"
+httpx = ">=0.23.0,<1"
+pydantic = ">=1.9.0,<3"
+sniffio = "*"
+tqdm = ">4"
+typing-extensions = ">=4.7,<5"
+
+[package.extras]
+datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
+
 [[package]]
 name = "orjson"
 version = "3.9.10"
diff --git a/pyproject.toml b/pyproject.toml
index 44fd75f..e07d0f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dbgpt_hub"
-version = "0.3.0"
+version = "0.3.1"
 description = "DB-GPT-Hub: Text-to-SQL parsing with LLMs"
 authors = ["Your Name <you@example.com>"]
 license = "MIT"
@@ -60,6 +60,7 @@ nvidia-nvjitlink-cu12 = "^12.3.52"
 prettytable = "^3.9.0"
 docopt = "^0.6.2"
 bitsandbytes = "0.41.3.post2"
+openai = "^1.6.1"
 
 [build-system]
 requires = ["poetry-core"]