diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 87bef368189b..39d1992d7623 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -1,45 +1,48 @@ name: Bug Report description: "Something is broken inside of ComfyUI. (Do not use this if you're just having issues and need help, or if the issue relates to a custom node)" -labels: [ "Potential Bug" ] +labels: ["Potential Bug"] body: - - type: markdown - attributes: - value: | - Before submitting a **Bug Report**, please ensure the following: + - type: markdown + attributes: + value: | + Before submitting a **Bug Report**, please ensure the following: - **1:** You are running the latest version of ComfyUI. - **2:** You have looked at the existing bug reports and made sure this isn't already reported. - **3:** This is an actual bug in ComfyUI, not just a support question and not caused by an custom node. A bug is when you can specify exact steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen. + - **1:** You are running the latest version of ComfyUI. + - **2:** You have looked at the existing bug reports and made sure this isn't already reported. + - **3:** You confirmed that the bug is not caused by a custom node. You can disable all custom nodes by passing the + `--disable-all-custom-nodes` command line argument. + - **4:** This is an actual bug in ComfyUI, not just a support question. A bug is when you can specify exact + steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen. - If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first. - - type: textarea - attributes: - label: Expected Behavior - description: "What you expected to happen." 
- validations: - required: true - - type: textarea - attributes: - label: Actual Behavior - description: "What actually happened. Please include a screenshot of the issue if possible." - validations: - required: true - - type: textarea - attributes: - label: Steps to Reproduce - description: "Describe how to reproduce the issue. Please be sure to attach a workflow JSON or PNG, ideally one that doesn't require custom nodes to test. If the bug open happens when certain custom nodes are used, most likely that custom node is what has the bug rather than ComfyUI, in which case it should be reported to the node's author." - validations: - required: true - - type: textarea - attributes: - label: Debug Logs - description: "Please copy the output from your terminal logs here." - render: powershell - validations: - required: true - - type: textarea - attributes: - label: Other - description: "Any other additional information you think might be helpful." - validations: - required: false + If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first. + - type: textarea + attributes: + label: Expected Behavior + description: "What you expected to happen." + validations: + required: true + - type: textarea + attributes: + label: Actual Behavior + description: "What actually happened. Please include a screenshot of the issue if possible." + validations: + required: true + - type: textarea + attributes: + label: Steps to Reproduce + description: "Describe how to reproduce the issue. Please be sure to attach a workflow JSON or PNG, ideally one that doesn't require custom nodes to test. If the bug only happens when certain custom nodes are used, most likely that custom node is what has the bug rather than ComfyUI, in which case it should be reported to the node's author." 
+ validations: + required: true + - type: textarea + attributes: + label: Debug Logs + description: "Please copy the output from your terminal logs here." + render: powershell + validations: + required: true + - type: textarea + attributes: + label: Other + description: "Any other additional information you think might be helpful." + validations: + required: false diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 000000000000..5effbea35fc7 --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,23 @@ +name: Python Linting + +on: [push, pull_request] + +jobs: + pylint: + name: Run Pylint + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.x + + - name: Install Pylint + run: pip install pylint + + - name: Run Pylint + run: pylint --rcfile=.pylintrc $(find . -type f -name "*.py") diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml new file mode 100644 index 000000000000..1fd76b53038b --- /dev/null +++ b/.github/workflows/stable-release.yml @@ -0,0 +1,109 @@ + +name: "Release Stable Version" + +on: + push: + tags: + - 'v*' + +jobs: + package_comfy_windows: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" + runs-on: windows-latest + strategy: + matrix: + python_version: [3.11.8] + cuda_version: [121] + steps: + - name: Calculate Minor Version + shell: bash + run: | + # Extract the minor version from the Python version + MINOR_VERSION=$(echo "${{ matrix.python_version }}" | cut -d'.' 
-f2) + echo "MINOR_VERSION=$MINOR_VERSION" >> $GITHUB_ENV + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python_version }} + + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + - shell: bash + run: | + echo "@echo off + call update_comfyui.bat nopause + echo - + echo This will try to update pytorch and all python dependencies. + echo - + echo If you just want to update normally, close this and run update_comfyui.bat instead. + echo - + pause + ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda_version }} -r ../ComfyUI/requirements.txt pygit2 + pause" > update_comfyui_and_python_dependencies.bat + + python -m pip wheel --no-cache-dir torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu${{ matrix.cuda_version }} -r requirements.txt pygit2 -w ./temp_wheel_dir + python -m pip install --no-cache-dir ./temp_wheel_dir/* + echo installed basic + ls -lah temp_wheel_dir + mv temp_wheel_dir cu${{ matrix.cuda_version }}_python_deps + mv cu${{ matrix.cuda_version }}_python_deps ../ + mv update_comfyui_and_python_dependencies.bat ../ + cd .. + pwd + ls + + cp -r ComfyUI ComfyUI_copy + curl https://www.python.org/ftp/python/${{ matrix.python_version }}/python-${{ matrix.python_version }}-embed-amd64.zip -o python_embeded.zip + unzip python_embeded.zip -d python_embeded + cd python_embeded + echo ${{ env.MINOR_VERSION }} + echo 'import site' >> ./python3${{ env.MINOR_VERSION }}._pth + curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py + ./python.exe get-pip.py + ./python.exe --version + echo "Pip version:" + ./python.exe -m pip --version + + export PATH="$PWD/Scripts:$PATH" + echo $PATH + ./python.exe -s -m pip install ../cu${{ matrix.cuda_version }}_python_deps/* + sed -i '1i../ComfyUI' ./python3${{ env.MINOR_VERSION }}._pth + cd .. 
+ + git clone https://github.com/comfyanonymous/taesd + cp taesd/*.pth ./ComfyUI_copy/models/vae_approx/ + + mkdir ComfyUI_windows_portable + mv python_embeded ComfyUI_windows_portable + mv ComfyUI_copy ComfyUI_windows_portable/ComfyUI + + cd ComfyUI_windows_portable + + mkdir update + cp -r ComfyUI/.ci/update_windows/* ./update/ + cp -r ComfyUI/.ci/windows_base_files/* ./ + cp ../update_comfyui_and_python_dependencies.bat ./update/ + + cd .. + + "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable + mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z + + cd ComfyUI_windows_portable + python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu + + ls + + - name: Upload binaries to release + uses: svenstaro/upload-release-action@v2 + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: ComfyUI_windows_portable_nvidia.7z + tag: ${{ github.ref }} + overwrite: true + diff --git a/.github/workflows/test-browser.yml b/.github/workflows/test-browser.yml index 65483b00f28d..7beb0c696469 100644 --- a/.github/workflows/test-browser.yml +++ b/.github/workflows/test-browser.yml @@ -42,7 +42,7 @@ jobs: working-directory: ComfyUI - name: Start ComfyUI server run: | - python main.py --cpu & + python main.py --cpu 2>&1 | tee console_output.log & wait-for-it --service 127.0.0.1:8188 -t 600 working-directory: ComfyUI - name: Install ComfyUI_frontend dependencies @@ -55,9 +55,22 @@ jobs: - name: Run Playwright tests run: npx playwright test working-directory: ComfyUI_frontend + - name: Check for unhandled exceptions in server log + run: | + if grep -qE "Exception|Error" console_output.log; then + echo "Unhandled exception/error found in server log." 
+ exit 1 + fi + working-directory: ComfyUI - uses: actions/upload-artifact@v4 if: always() with: name: playwright-report path: ComfyUI_frontend/playwright-report/ retention-days: 30 + - uses: actions/upload-artifact@v4 + if: always() + with: + name: console-output + path: ComfyUI/console_output.log + retention-days: 30 diff --git a/.github/workflows/test-ui.yaml b/.github/workflows/test-ui.yaml index 4b8b9793479c..d947e9d5fc63 100644 --- a/.github/workflows/test-ui.yaml +++ b/.github/workflows/test-ui.yaml @@ -24,3 +24,7 @@ jobs: npm run test:generate npm test -- --verbose working-directory: ./tests-ui + - name: Run Unit Tests + run: | + pip install -r tests-unit/requirements.txt + python -m pytest tests-unit diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml index ffd3e2216e57..5aa57e7d7614 100644 --- a/.github/workflows/windows_release_dependencies.yml +++ b/.github/workflows/windows_release_dependencies.yml @@ -33,8 +33,8 @@ jobs: build_dependencies: runs-on: windows-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: 3.${{ inputs.python_minor }}.${{ inputs.python_patch }} @@ -58,7 +58,7 @@ jobs: mv temp_wheel_dir cu${{ inputs.cu }}_python_deps tar cf cu${{ inputs.cu }}_python_deps.tar cu${{ inputs.cu }}_python_deps - - uses: actions/cache/save@v3 + - uses: actions/cache/save@v4 with: path: | cu${{ inputs.cu }}_python_deps.tar diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml index fa24a985c7fa..e68011b64e4d 100644 --- a/.github/workflows/windows_release_nightly_pytorch.yml +++ b/.github/workflows/windows_release_nightly_pytorch.yml @@ -32,11 +32,11 @@ jobs: pull-requests: "read" runs-on: windows-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 persist-credentials: false 
- - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: 3.${{ inputs.python_minor }}.${{ inputs.python_patch }} - shell: bash @@ -73,7 +73,7 @@ jobs: pause" > ./update/update_comfyui_and_python_dependencies.bat cd .. - "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch + "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch mv ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI/ComfyUI_windows_portable_nvidia_or_cpu_nightly_pytorch.7z cd ComfyUI_windows_portable_nightly_pytorch diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index 4e3cdabd2d57..020741c411f9 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -32,7 +32,7 @@ jobs: pull-requests: "read" runs-on: windows-latest steps: - - uses: actions/cache/restore@v3 + - uses: actions/cache/restore@v4 id: cache with: path: | @@ -48,7 +48,7 @@ jobs: pwd ls - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 persist-credentials: false @@ -82,7 +82,7 @@ jobs: cd .. 
- "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable + "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable mv ComfyUI_windows_portable.7z ComfyUI/new_ComfyUI_windows_portable_nvidia_cu${{ inputs.cu }}_or_cpu.7z cd ComfyUI_windows_portable diff --git a/.gitignore b/.gitignore index a9beebe73a73..598e968c8afb 100644 --- a/.gitignore +++ b/.gitignore @@ -5,9 +5,7 @@ __pycache__/ !/input/example.png /models/ /temp/ -/custom_nodes/ !custom_nodes/example_node.py.example -extra_model_paths.yaml /.vs .vscode/ .idea/ @@ -17,4 +15,5 @@ venv/ !/web/extensions/core/ /tests-ui/data/object_info.json /user/ -*.log \ No newline at end of file +*.log +web_custom_versions/ \ No newline at end of file diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 000000000000..a5da56e57ca4 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,3 @@ +[MESSAGES CONTROL] +disable=all +enable=eval-used diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000000..048f127e72dd --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,41 @@ +# Contributing to ComfyUI + +Welcome, and thank you for your interest in contributing to ComfyUI! + +There are several ways in which you can contribute, beyond writing code. The goal of this document is to provide a high-level overview of how you can get involved. + +## Asking Questions + +Have a question? Instead of opening an issue, please ask on [Discord](https://comfy.org/discord) or [Matrix](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) channels. Our team and the community will help you. + +## Providing Feedback + +Your comments and feedback are welcome, and the development team is available via a handful of different channels. + +See the `#bug-report`, `#feature-request` and `#feedback` channels on Discord. 
+ +## Reporting Issues + +Have you identified a reproducible problem in ComfyUI? Do you have a feature request? We want to hear about it! Here's how you can report your issue as effectively as possible. + + +### Look For an Existing Issue + +Before you create a new issue, please do a search in [open issues](https://github.com/comfyanonymous/ComfyUI/issues) to see if the issue or feature request has already been filed. + +If you find your issue already exists, make relevant comments and add your [reaction](https://github.com/blog/2119-add-reactions-to-pull-requests-issues-and-comments). Use a reaction in place of a "+1" comment: + +* 👍 - upvote +* 👎 - downvote + +If you cannot find an existing issue that describes your bug or feature, create a new issue. We have an issue template in place to organize new issues. + + +### Creating Pull Requests + +* Please refer to the article on [creating pull requests](https://github.com/comfyanonymous/ComfyUI/wiki/How-to-Contribute-Code) and contributing to this project. + + +## Thank You + +Your contributions to open source, large or small, make great projects like this possible. Thank you for taking the time to contribute. 
diff --git a/Notebok_workflow_latent_safety.ipynb b/Notebok_workflow_latent_safety.ipynb new file mode 100644 index 000000000000..b92f584be517 --- /dev/null +++ b/Notebok_workflow_latent_safety.ipynb @@ -0,0 +1,524 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2f86b673-a26f-48fd-a04f-c0ff2bc385b7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Total VRAM 40337 MB, total RAM 515700 MB\n", + "Set vram state to: NORMAL_VRAM\n", + "Device: cuda:0 NVIDIA A100-SXM4-40GB : native\n", + "VAE dtype: torch.bfloat16\n", + "Using pytorch cross attention\n" + ] + } + ], + "source": [ + "import os\n", + "import nodes\n", + "import torch\n", + "import random\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from comfy import sd\n", + "from PIL import Image\n", + "\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b470182c-87d2-44b4-bfe7-0f76ca96b2f9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "model_type EPS\n", + "Using pytorch attention in VAE\n", + "Using pytorch attention in VAE\n", + "clip missing: ['clip_l.logit_scale', 'clip_l.transformer.text_projection.weight']\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(, , , None)\n", + "None\n", + "(, , , None)\n", + "\n" + ] + } + ], + "source": [ + "#CheckpointLoader\n", + "\n", + "ckpt_path = \"/dlabscratch1/wendler/models/sdxl-turbo/sd_xl_turbo_1.0_fp16.safetensors\"#folder_paths.get_full_path(\"checkpoints\", ckpt_name)\n", + "out = sd.load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=True, embedding_directory=\"./embeddings\")\n", + "(model, clip, vae, clip_vision) = out\n", + "\n", + "print(out)\n", + "print(clip_vision)\n", + "print(out)\n", + "print(model)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 3, + "id": "05848252-f0cf-424d-b71f-32c055eab33f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Requested to load SDXLClipModel\n", + "Loading 1 new model\n" + ] + } + ], + "source": [ + "# CLIP Text Encode\n", + "\n", + "text = \"dog\"\n", + "tokens = clip.tokenize(text)\n", + "cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)\n", + "positive = [[cond, {\"pooled_output\": pooled}]]\n", + "\n", + "text = \"text, watermark\"\n", + "tokens = clip.tokenize(text)\n", + "cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)\n", + "negative = [[cond, {\"pooled_output\": pooled}]]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "86b23ca2-adff-4b96-97e2-6be243988ce7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 1280])\n", + "torch.Size([1, 77, 2048])\n" + ] + } + ], + "source": [ + "print(pooled.shape)\n", + "print(cond.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4e48771d-b95c-4f7c-b78e-6ecf336981cf", + "metadata": {}, + "outputs": [], + "source": [ + "# Empty Latent Image\n", + "\n", + "batch_size = 2\n", + "height, width = 512, 512\n", + "\n", + "latent = torch.zeros([batch_size, 4, height // 8, width // 8], device=device)\n", + "latent_image = {\"samples\":latent}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8c7fba7c-2d8c-4415-b05b-3af4104f577f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Requested to load SDXL\n", + "Loading 1 new model\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "06efdf75794b48888eb0665a7c3baa7c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/30 [00:00)\n", + "torch.Size([2, 640])\n", + "Similarity (text, watermark):\t[[0.16905537 0.1504263 ]\n", + " 
[0.15587994 0.13815394]]\n" + ] + } + ], + "source": [ + "image_features = model_latent.encode_image(samples[\"samples\"])\n", + "\n", + "sensitive_word = \"cat\"\n", + "text_features = model_latent.encode_text(tokenizer_latent([f\"an image of {sensitive_word}\", f\"an image of no {sensitive_word}\"]))\n", + "print(image_features.shape)\n", + "image_features /= image_features.norm(dim=-1, keepdim=True)\n", + "text_features /= text_features.norm(dim=-1, keepdim=True)\n", + "\n", + "text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)\n", + "\n", + "print(text_probs[0][0].item())\n", + "print(text_probs)\n", + "#text_features = model_latent.encode_text(captions.cuda())\n", + "print(image_features.shape)\n", + "image_features_np = image_features.detach().numpy()\n", + "text_features_np = text_features.detach().numpy()\n", + "\n", + "similarity_score = cosine_similarity(image_features_np, text_features_np)\n", + "print(f\"Similarity ({text}):\\t{similarity_score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "34a80f73-1a07-42f7-826d-1497ddf28d19", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Requested to load AutoencoderKL\n", + "Loading 1 new model\n" + ] + } + ], + "source": [ + "# VAE Decode\n", + "\n", + "images = vae.decode(samples[\"samples\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6618356a-7836-4855-99c2-5f4d3f585afa", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Save Image\n", + "\n", + "#filename_prefix += self.prefix_append\n", + "#full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0])\n", + "results = list()\n", + "for (batch_number, image) in enumerate(images):\n", + " i = 255. * image.detach().numpy()\n", + " img = Image.fromarray(np.clip(i, 0, 255).astype(np.uint8))\n", + " metadata = None\n", + " \"\"\"\n", + " if not args.disable_metadata:\n", + " metadata = PngInfo()\n", + " if prompt is not None:\n", + " metadata.add_text(\"prompt\", json.dumps(prompt))\n", + " if extra_pnginfo is not None:\n", + " for x in extra_pnginfo:\n", + " metadata.add_text(x, json.dumps(extra_pnginfo[x]))\n", + " filename_with_batch_num = filename.replace(\"%batch_num%\", str(batch_number))\n", + " file = f\"{filename_with_batch_num}_{counter:05}_.png\"\n", + " img.save(os.path.join(full_output_folder, file), pnginfo=metadata, compress_level=self.compress_level)\n", + " results.append({\n", + " \"filename\": file,\n", + " \"subfolder\": subfolder,\n", + " \"type\": self.type\n", + " })\n", + " counter += 1\n", + " \"\"\"\n", + " plt.imshow(img)\n", + " plt.axis('off') # Turn off axis\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "01cda7fe-7b66-4810-9d9c-baccec11d497", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity (cat):\t0.12320650368928909\n", + "Similarity (no cat):\t0.1302909255027771\n", + "tensor([[1.]], device='cuda:0', grad_fn=)\n", + "tensor([[12.3207]], device='cuda:0', grad_fn=)\n" + ] + } + ], + "source": [ + "# Safety Filter\n", + "\n", + "import requests\n", + "from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "def 
get_model_info(model_ID, device):\n", + "\tmodel = CLIPModel.from_pretrained(model_ID).to(device)\n", + "\tprocessor = CLIPProcessor.from_pretrained(model_ID)\n", + "\ttokenizer = CLIPTokenizer.from_pretrained(model_ID)\n", + "\treturn model, processor, tokenizer\n", + "\n", + "def get_single_text_embedding(text): \n", + " inputs = tokenizer(text, return_tensors = \"pt\").to(device)\n", + " text_embeddings = model_clip.get_text_features(**inputs)\n", + " embedding_as_np = text_embeddings.cpu().detach().numpy()\n", + " return embedding_as_np\n", + "\n", + "def get_single_image_embedding(my_image):\n", + " image = processor(\n", + " \t\ttext = None,\n", + " \t\timages = my_image,\n", + " \t\treturn_tensors=\"pt\"\n", + " \t\t)[\"pixel_values\"].to(device)\n", + " embedding = model_clip.get_image_features(image)\n", + " embedding_as_np = embedding.cpu().detach().numpy()\n", + " return embedding_as_np\n", + " \n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "model_IDs = [\"openai/clip-vit-base-patch32\", \"openai/clip-vit-large-patch14\"]\n", + "model_ID = model_IDs[1]\n", + "model_clip, processor, tokenizer = get_model_info(model_ID, device)\n", + "\n", + "#url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = img #Image.open(requests.get(url, stream=True).raw)\n", + "image_embedding = get_single_image_embedding(image)\n", + "\n", + "texts=[\"cat\", \"no cat\"]\n", + "\n", + "for text in texts:\n", + " text_embedding = get_single_text_embedding(text)\n", + " similarity_score = cosine_similarity(image_embedding, text_embedding)[0][0]\n", + " print(f\"Similarity ({text}):\\t{similarity_score}\")\n", + "\n", + "inputs = processor(text=\"cat\", images=image, return_tensors=\"pt\", padding=True).to(device)\n", + "outputs = model_clip(**inputs)\n", + "\n", + "logits_per_image = outputs.logits_per_image # this is the image-text similarity score\n", + "probs = logits_per_image.softmax(dim=1) # we can take the softmax 
to get the label probabilities\n", + "\n", + "print(probs)\n", + "print(logits_per_image)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8eecba77-0617-4e9b-8f65-7e16b78782b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12.32065200805664" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logits_per_image.item()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ca4d6d49-bfeb-4275-a573-7eccc8ac6bf0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n", + "/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/diffusers/pipelines/pipeline_utils.py:761: FutureWarning: `torch_dtype` is deprecated and will be removed in version 0.25.0. \n", + " deprecate(\"torch_dtype\", \"0.25.0\", \"\")\n" + ] + }, + { + "ename": "OutOfMemoryError", + "evalue": "CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.39 GiB of which 81.50 MiB is free. Process 2457010 has 19.18 GiB memory in use. Process 2464208 has 20.13 GiB memory in use. Of the allocated memory 19.37 GiB is allocated by PyTorch, and 246.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 7\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m 4\u001b[0m pipeline \u001b[38;5;241m=\u001b[39m StableDiffusionXLPipeline\u001b[38;5;241m.\u001b[39mfrom_single_file(\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/dlabscratch1/wendler/models/sdxl-turbo/sd_xl_turbo_1.0.safetensors\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m torch_dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat16, variant\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfp16\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 7\u001b[0m pipeline \u001b[38;5;241m=\u001b[39m pipeline\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 8\u001b[0m pipeline\u001b[38;5;241m.\u001b[39mscheduler \u001b[38;5;241m=\u001b[39m EulerAncestralDiscreteScheduler\u001b[38;5;241m.\u001b[39mfrom_config(pipeline\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mconfig, timestep_spacing\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrailing\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/diffusers/pipelines/pipeline_utils.py:864\u001b[0m, in \u001b[0;36mDiffusionPipeline.to\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 860\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 861\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe module 
\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodule\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m has been loaded in 8bit and moving it to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtorch_dtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m via `.to()` is not yet supported. Module is still on \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodule\u001b[38;5;241m.\u001b[39mdevice\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 862\u001b[0m )\n\u001b[1;32m 863\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 864\u001b[0m module\u001b[38;5;241m.\u001b[39mto(device, dtype)\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 867\u001b[0m module\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m torch\u001b[38;5;241m.\u001b[39mfloat16\n\u001b[1;32m 868\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(device) \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 869\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m silence_dtype_warnings\n\u001b[1;32m 870\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_offloaded\n\u001b[1;32m 871\u001b[0m ):\n\u001b[1;32m 872\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarning(\n\u001b[1;32m 873\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 874\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m is not recommended to move them to `cpu` as running them will fail. 
Please make\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 877\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m `torch_dtype=torch.float16` argument, or use another device for inference.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 878\u001b[0m )\n", + "File \u001b[0;32m/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/torch/nn/modules/module.py:1152\u001b[0m, in \u001b[0;36mModule.to\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1148\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1149\u001b[0m non_blocking, memory_format\u001b[38;5;241m=\u001b[39mconvert_to_format)\n\u001b[1;32m 1150\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, non_blocking)\n\u001b[0;32m-> 1152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_apply(convert)\n", + "File \u001b[0;32m/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/torch/nn/modules/module.py:802\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 800\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 801\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 802\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 804\u001b[0m 
\u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 805\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 806\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 807\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", + "File \u001b[0;32m/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/torch/nn/modules/module.py:802\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 800\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 801\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 802\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 804\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 805\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 806\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 807\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the 
future\u001b[39;00m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", + " \u001b[0;31m[... skipping similar frames: Module._apply at line 802 (6 times)]\u001b[0m\n", + "File \u001b[0;32m/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/torch/nn/modules/module.py:802\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 800\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[1;32m 801\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchildren():\n\u001b[0;32m--> 802\u001b[0m module\u001b[38;5;241m.\u001b[39m_apply(fn)\n\u001b[1;32m 804\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied):\n\u001b[1;32m 805\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[1;32m 806\u001b[0m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[1;32m 807\u001b[0m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[1;32m 813\u001b[0m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", + "File \u001b[0;32m/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/torch/nn/modules/module.py:825\u001b[0m, in \u001b[0;36mModule._apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;66;03m# Tensors stored in modules are graph leaves, and we don't want to\u001b[39;00m\n\u001b[1;32m 822\u001b[0m \u001b[38;5;66;03m# track autograd history of `param_applied`, so we have 
to use\u001b[39;00m\n\u001b[1;32m 823\u001b[0m \u001b[38;5;66;03m# `with torch.no_grad():`\u001b[39;00m\n\u001b[1;32m 824\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad():\n\u001b[0;32m--> 825\u001b[0m param_applied \u001b[38;5;241m=\u001b[39m fn(param)\n\u001b[1;32m 826\u001b[0m should_use_set_data \u001b[38;5;241m=\u001b[39m compute_should_use_set_data(param, param_applied)\n\u001b[1;32m 827\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m should_use_set_data:\n", + "File \u001b[0;32m/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/torch/nn/modules/module.py:1150\u001b[0m, in \u001b[0;36mModule.to..convert\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 1147\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m convert_to_format \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m t\u001b[38;5;241m.\u001b[39mdim() \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;241m4\u001b[39m, \u001b[38;5;241m5\u001b[39m):\n\u001b[1;32m 1148\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1149\u001b[0m non_blocking, memory_format\u001b[38;5;241m=\u001b[39mconvert_to_format)\n\u001b[0;32m-> 1150\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m t\u001b[38;5;241m.\u001b[39mto(device, dtype \u001b[38;5;28;01mif\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t\u001b[38;5;241m.\u001b[39mis_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m, non_blocking)\n", + "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 20.00 MiB. 
GPU 0 has a total capacity of 39.39 GiB of which 81.50 MiB is free. Process 2457010 has 19.18 GiB memory in use. Process 2464208 has 20.13 GiB memory in use. Of the allocated memory 19.37 GiB is allocated by PyTorch, and 246.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" + ] + } + ], + "source": [ + "from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler\n", + "import torch\n", + "\n", + "pipeline = StableDiffusionXLPipeline.from_single_file(\n", + " \"/dlabscratch1/wendler/models/sdxl-turbo/sd_xl_turbo_1.0.safetensors\",\n", + " torch_dtype=torch.float16, variant=\"fp16\")\n", + "pipeline = pipeline.to(\"cuda\")\n", + "pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, timestep_spacing=\"trailing\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e4bae4d-e5a3-49af-ad58-a461d8e9f59c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Notebok_workflow_safety.ipynb b/Notebok_workflow_safety.ipynb new file mode 100644 index 000000000000..abdf3178f682 --- /dev/null +++ b/Notebok_workflow_safety.ipynb @@ -0,0 +1,391 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2f86b673-a26f-48fd-a04f-c0ff2bc385b7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "Total VRAM 40337 MB, total RAM 515700 MB\n", + "Set vram state to: NORMAL_VRAM\n", + "Device: cuda:0 NVIDIA A100-SXM4-40GB : native\n", + "VAE dtype: torch.bfloat16\n", + "Using pytorch cross attention\n" + ] + } + ], + "source": [ + "import os\n", + "import nodes\n", + "import torch\n", + "import random\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from comfy import sd\n", + "from PIL import Image\n", + "\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b470182c-87d2-44b4-bfe7-0f76ca96b2f9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "model_type EPS\n", + "Using pytorch attention in VAE\n", + "Using pytorch attention in VAE\n", + "clip missing: ['clip_l.logit_scale', 'clip_l.transformer.text_projection.weight']\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(, , , None)\n", + "None\n", + "(, , , None)\n", + "\n" + ] + } + ], + "source": [ + "ckpt_path = \"/dlabscratch1/wendler/models/sdxl-turbo/sd_xl_turbo_1.0_fp16.safetensors\"#folder_paths.get_full_path(\"checkpoints\", ckpt_name)\n", + "out = sd.load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, output_clipvision=True, embedding_directory=\"./embeddings\")\n", + "(model, clip, vae, clip_vision) = out\n", + "\n", + "print(out)\n", + "print(clip_vision)\n", + "print(out)\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "05848252-f0cf-424d-b71f-32c055eab33f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Requested to load SDXLClipModel\n", + "Loading 1 new model\n" + ] + } + ], + "source": [ + "text = \"cat\"\n", + "tokens = clip.tokenize(text)\n", + "cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)\n", + "positive = [[cond, 
{\"pooled_output\": pooled}]]\n", + "\n", + "text = \"text, watermark\"\n", + "tokens = clip.tokenize(text)\n", + "cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)\n", + "negative = [[cond, {\"pooled_output\": pooled}]]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "86b23ca2-adff-4b96-97e2-6be243988ce7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 1280])\n", + "torch.Size([1, 77, 2048])\n" + ] + } + ], + "source": [ + "print(pooled.shape)\n", + "print(cond.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4e48771d-b95c-4f7c-b78e-6ecf336981cf", + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 1\n", + "height, width = 512, 512\n", + "\n", + "latent = torch.zeros([batch_size, 4, height // 8, width // 8], device=device)\n", + "latent_image = {\"samples\":latent}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8c7fba7c-2d8c-4415-b05b-3af4104f577f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Requested to load SDXL\n", + "Loading 1 new model\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bb36109d812e403eb9642ed558c7698c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/30 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#filename_prefix += self.prefix_append\n", + "#full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0])\n", + "results = list()\n", + "for (batch_number, image) in enumerate(images):\n", + " i = 255. 
* image.detach().numpy()\n", + " img = Image.fromarray(np.clip(i, 0, 255).astype(np.uint8))\n", + " metadata = None\n", + " \"\"\"\n", + " if not args.disable_metadata:\n", + " metadata = PngInfo()\n", + " if prompt is not None:\n", + " metadata.add_text(\"prompt\", json.dumps(prompt))\n", + " if extra_pnginfo is not None:\n", + " for x in extra_pnginfo:\n", + " metadata.add_text(x, json.dumps(extra_pnginfo[x]))\n", + " filename_with_batch_num = filename.replace(\"%batch_num%\", str(batch_number))\n", + " file = f\"{filename_with_batch_num}_{counter:05}_.png\"\n", + " img.save(os.path.join(full_output_folder, file), pnginfo=metadata, compress_level=self.compress_level)\n", + " results.append({\n", + " \"filename\": file,\n", + " \"subfolder\": subfolder,\n", + " \"type\": self.type\n", + " })\n", + " counter += 1\n", + " \"\"\"\n", + " plt.imshow(img)\n", + " plt.axis('off') # Turn off axis\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "01cda7fe-7b66-4810-9d9c-baccec11d497", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity (cat):\t0.19083161652088165\n", + "Similarity (no cat):\t0.176759272813797\n", + "tensor([[1.]], device='cuda:0', grad_fn=)\n", + "tensor([[19.0832]], device='cuda:0', grad_fn=)\n" + ] + } + ], + "source": [ + "import requests\n", + "from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "def get_model_info(model_ID, device):\n", + "\tmodel = CLIPModel.from_pretrained(model_ID).to(device)\n", + "\tprocessor = CLIPProcessor.from_pretrained(model_ID)\n", + "\ttokenizer = CLIPTokenizer.from_pretrained(model_ID)\n", + "\treturn model, processor, tokenizer\n", + "\n", + "def get_single_text_embedding(text): \n", + " inputs = tokenizer(text, return_tensors = \"pt\").to(device)\n", + " text_embeddings = model_clip.get_text_features(**inputs)\n", + " embedding_as_np = 
text_embeddings.cpu().detach().numpy()\n", + " return embedding_as_np\n", + "\n", + "def get_single_image_embedding(my_image):\n", + " image = processor(\n", + " \t\ttext = None,\n", + " \t\timages = my_image,\n", + " \t\treturn_tensors=\"pt\"\n", + " \t\t)[\"pixel_values\"].to(device)\n", + " embedding = model_clip.get_image_features(image)\n", + " embedding_as_np = embedding.cpu().detach().numpy()\n", + " return embedding_as_np\n", + " \n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "model_IDs = [\"openai/clip-vit-base-patch32\", \"openai/clip-vit-large-patch14\"]\n", + "model_ID = model_IDs[1]\n", + "model_clip, processor, tokenizer = get_model_info(model_ID, device)\n", + "\n", + "#url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n", + "image = img #Image.open(requests.get(url, stream=True).raw)\n", + "image_embedding = get_single_image_embedding(image)\n", + "\n", + "texts=[\"cat\", \"no cat\"]\n", + "\n", + "for text in texts:\n", + " text_embedding = get_single_text_embedding(text)\n", + " similarity_score = cosine_similarity(image_embedding, text_embedding)[0][0]\n", + " print(f\"Similarity ({text}):\\t{similarity_score}\")\n", + "\n", + "\n", + "inputs = processor(text=\"cat\", images=image, return_tensors=\"pt\", padding=True).to(device)\n", + "outputs = model_clip(**inputs)\n", + "\n", + "logits_per_image = outputs.logits_per_image # this is the image-text similarity score\n", + "probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities\n", + "\n", + "print(probs)\n", + "print(logits_per_image)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8eecba77-0617-4e9b-8f65-7e16b78782b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19.083162307739258" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logits_per_image.item()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 11, + "id": "ca4d6d49-bfeb-4275-a573-7eccc8ac6bf0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n", + " torch.utils._pytree._register_pytree_node(\n", + "/dlabscratch1/jabecker/conda/envs/test_env/lib/python3.11/site-packages/diffusers/pipelines/pipeline_utils.py:761: FutureWarning: `torch_dtype` is deprecated and will be removed in version 0.25.0. \n", + " deprecate(\"torch_dtype\", \"0.25.0\", \"\")\n" + ] + } + ], + "source": [ + "from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler\n", + "import torch\n", + "\n", + "pipeline = StableDiffusionXLPipeline.from_single_file(\n", + " \"/dlabscratch1/wendler/models/sdxl-turbo/sd_xl_turbo_1.0.safetensors\",\n", + " torch_dtype=torch.float16, variant=\"fp16\")\n", + "pipeline = pipeline.to(\"cuda\")\n", + "pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, timestep_spacing=\"trailing\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e4bae4d-e5a3-49af-ad58-a461d8e9f59c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/README.md b/README.md index a40dd07dd472..5f747321fa02 100644 --- a/README.md +++ b/README.md @@ -11,16 +11,16 @@ This ui will let you design and execute advanced 
stable diffusion pipelines usin ## Features - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything. -- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/) and [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) +- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/), [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) and [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/) - Asynchronous Queue system - Many optimizations: Only re-executes the parts of the workflow that changes between executions. -- Command line option: ```--lowvram``` to make it work on GPUs with less than 3GB vram (enabled automatically on GPUs with low vram) +- Smart memory management: can automatically run models on GPUs with as low as 1GB vram. - Works even if you don't have a GPU with: ```--cpu``` (slow) - Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs and CLIP models. - Embeddings/Textual inversion - [Loras (regular, locon and loha)](https://comfyanonymous.github.io/ComfyUI_examples/lora/) - [Hypernetworks](https://comfyanonymous.github.io/ComfyUI_examples/hypernetworks/) -- Loading full workflows (with seeds) from generated PNG files. +- Loading full workflows (with seeds) from generated PNG, WebP and FLAC files. - Saving/Loading workflows as Json files. - Nodes interface can be used to create complex workflows like one for [Hires fix](https://comfyanonymous.github.io/ComfyUI_examples/2_pass_txt2img/) or much more advanced ones. 
- [Area Composition](https://comfyanonymous.github.io/ComfyUI_examples/area_composition/) @@ -32,6 +32,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin - [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/) - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/) - [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/) +- [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/) - Latent previews with [TAESD](#how-to-show-high-quality-previews) - Starts up very fast. - Works fully offline: will never download anything. @@ -225,12 +226,11 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w [Matrix space: #comfyui_space:matrix.org](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) (it's like discord but open source). -# QA +See also: [https://www.comfy.org/](https://www.comfy.org/) -### Why did you make this? +# QA -I wanted to learn how Stable Diffusion worked in detail. I also wanted something clean and powerful that would let me experiment with SD without restrictions. +### Which GPU should I buy for this? -### Who is this for? +[See this page for some recommendations](https://github.com/comfyanonymous/ComfyUI/wiki/Which-GPU-should-I-buy-for-ComfyUI) -This is for anyone that wants to make complex workflows with SD or that wants to learn more how SD works. The interface follows closely how SD works and the code should be much more simple to understand than other SD UIs. 
diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/app/frontend_management.py b/app/frontend_management.py new file mode 100644 index 000000000000..fb57b23f383d --- /dev/null +++ b/app/frontend_management.py @@ -0,0 +1,188 @@ +from __future__ import annotations +import argparse +import logging +import os +import re +import tempfile +import zipfile +from dataclasses import dataclass +from functools import cached_property +from pathlib import Path +from typing import TypedDict + +import requests +from typing_extensions import NotRequired +from comfy.cli_args import DEFAULT_VERSION_STRING + + +REQUEST_TIMEOUT = 10 # seconds + + +class Asset(TypedDict): + url: str + + +class Release(TypedDict): + id: int + tag_name: str + name: str + prerelease: bool + created_at: str + published_at: str + body: str + assets: NotRequired[list[Asset]] + + +@dataclass +class FrontEndProvider: + owner: str + repo: str + + @property + def folder_name(self) -> str: + return f"{self.owner}_{self.repo}" + + @property + def release_url(self) -> str: + return f"https://api.github.com/repos/{self.owner}/{self.repo}/releases" + + @cached_property + def all_releases(self) -> list[Release]: + releases = [] + api_url = self.release_url + while api_url: + response = requests.get(api_url, timeout=REQUEST_TIMEOUT) + response.raise_for_status() # Raises an HTTPError if the response was an error + releases.extend(response.json()) + # GitHub uses the Link header to provide pagination links. Check if it exists and update api_url accordingly. 
+ if "next" in response.links: + api_url = response.links["next"]["url"] + else: + api_url = None + return releases + + @cached_property + def latest_release(self) -> Release: + latest_release_url = f"{self.release_url}/latest" + response = requests.get(latest_release_url, timeout=REQUEST_TIMEOUT) + response.raise_for_status() # Raises an HTTPError if the response was an error + return response.json() + + def get_release(self, version: str) -> Release: + if version == "latest": + return self.latest_release + else: + for release in self.all_releases: + if release["tag_name"] in [version, f"v{version}"]: + return release + raise ValueError(f"Version {version} not found in releases") + + +def download_release_asset_zip(release: Release, destination_path: str) -> None: + """Download dist.zip from github release.""" + asset_url = None + for asset in release.get("assets", []): + if asset["name"] == "dist.zip": + asset_url = asset["url"] + break + + if not asset_url: + raise ValueError("dist.zip not found in the release assets") + + # Use a temporary file to download the zip content + with tempfile.TemporaryFile() as tmp_file: + headers = {"Accept": "application/octet-stream"} + response = requests.get( + asset_url, headers=headers, allow_redirects=True, timeout=REQUEST_TIMEOUT + ) + response.raise_for_status() # Ensure we got a successful response + + # Write the content to the temporary file + tmp_file.write(response.content) + + # Go back to the beginning of the temporary file + tmp_file.seek(0) + + # Extract the zip file content to the destination path + with zipfile.ZipFile(tmp_file, "r") as zip_ref: + zip_ref.extractall(destination_path) + + +class FrontendManager: + DEFAULT_FRONTEND_PATH = str(Path(__file__).parents[1] / "web") + CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions") + + @classmethod + def parse_version_string(cls, value: str) -> tuple[str, str, str]: + """ + Args: + value (str): The version string to parse. 
+ + Returns: + tuple[str, str, str]: A tuple containing the repository owner, repository name, and version. + + Raises: + argparse.ArgumentTypeError: If the version string is invalid. + """ + VERSION_PATTERN = r"^([a-zA-Z0-9][a-zA-Z0-9-]{0,38})/([a-zA-Z0-9_.-]+)@(v?\d+\.\d+\.\d+|latest)$" + match_result = re.match(VERSION_PATTERN, value) + if match_result is None: + raise argparse.ArgumentTypeError(f"Invalid version string: {value}") + + return match_result.group(1), match_result.group(2), match_result.group(3) + + @classmethod + def init_frontend_unsafe(cls, version_string: str) -> str: + """ + Initializes the frontend for the specified version. + + Args: + version_string (str): The version string. + + Returns: + str: The path to the initialized frontend. + + Raises: + Exception: If there is an error during the initialization process. + main error source might be request timeout or invalid URL. + """ + if version_string == DEFAULT_VERSION_STRING: + return cls.DEFAULT_FRONTEND_PATH + + repo_owner, repo_name, version = cls.parse_version_string(version_string) + provider = FrontEndProvider(repo_owner, repo_name) + release = provider.get_release(version) + + semantic_version = release["tag_name"].lstrip("v") + web_root = str( + Path(cls.CUSTOM_FRONTENDS_ROOT) / provider.folder_name / semantic_version + ) + if not os.path.exists(web_root): + os.makedirs(web_root, exist_ok=True) + logging.info( + "Downloading frontend(%s) version(%s) to (%s)", + provider.folder_name, + semantic_version, + web_root, + ) + logging.debug(release) + download_release_asset_zip(release, destination_path=web_root) + return web_root + + @classmethod + def init_frontend(cls, version_string: str) -> str: + """ + Initializes the frontend with the specified version string. + + Args: + version_string (str): The version string to initialize the frontend with. + + Returns: + str: The path of the initialized frontend. 
+ """ + try: + return cls.init_frontend_unsafe(version_string) + except Exception as e: + logging.error("Failed to initialize frontend: %s", e) + logging.info("Falling back to the default frontend.") + return cls.DEFAULT_FRONTEND_PATH diff --git a/comfy/cldm/cldm.py b/comfy/cldm/cldm.py index 28076dd9251e..1d7294bd63de 100644 --- a/comfy/cldm/cldm.py +++ b/comfy/cldm/cldm.py @@ -13,7 +13,46 @@ from ..ldm.modules.attention import SpatialTransformer from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample from ..ldm.util import exists +from collections import OrderedDict import comfy.ops +from comfy.ldm.modules.attention import optimized_attention + +class OptimizedAttention(nn.Module): + def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None): + super().__init__() + self.heads = nhead + self.c = c + + self.in_proj = operations.Linear(c, c * 3, bias=True, dtype=dtype, device=device) + self.out_proj = operations.Linear(c, c, bias=True, dtype=dtype, device=device) + + def forward(self, x): + x = self.in_proj(x) + q, k, v = x.split(self.c, dim=2) + out = optimized_attention(q, k, v, self.heads) + return self.out_proj(out) + +class QuickGELU(nn.Module): + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + +class ResBlockUnionControlnet(nn.Module): + def __init__(self, dim, nhead, dtype=None, device=None, operations=None): + super().__init__() + self.attn = OptimizedAttention(dim, nhead, dtype=dtype, device=device, operations=operations) + self.ln_1 = operations.LayerNorm(dim, dtype=dtype, device=device) + self.mlp = nn.Sequential( + OrderedDict([("c_fc", operations.Linear(dim, dim * 4, dtype=dtype, device=device)), ("gelu", QuickGELU()), + ("c_proj", operations.Linear(dim * 4, dim, dtype=dtype, device=device))])) + self.ln_2 = operations.LayerNorm(dim, dtype=dtype, device=device) + + def attention(self, x: torch.Tensor): + return self.attn(x) + + def forward(self, 
x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x class ControlledUnetModel(UNetModel): #implemented in the ldm unet @@ -53,6 +92,7 @@ def __init__( transformer_depth_middle=None, transformer_depth_output=None, attn_precision=None, + union_controlnet_num_control_type=None, device=None, operations=comfy.ops.disable_weight_init, **kwargs, @@ -280,6 +320,65 @@ def __init__( self.middle_block_out = self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device) self._feature_size += ch + if union_controlnet_num_control_type is not None: + self.num_control_type = union_controlnet_num_control_type + num_trans_channel = 320 + num_trans_head = 8 + num_trans_layer = 1 + num_proj_channel = 320 + # task_scale_factor = num_trans_channel ** 0.5 + self.task_embedding = nn.Parameter(torch.empty(self.num_control_type, num_trans_channel, dtype=self.dtype, device=device)) + + self.transformer_layes = nn.Sequential(*[ResBlockUnionControlnet(num_trans_channel, num_trans_head, dtype=self.dtype, device=device, operations=operations) for _ in range(num_trans_layer)]) + self.spatial_ch_projs = operations.Linear(num_trans_channel, num_proj_channel, dtype=self.dtype, device=device) + #----------------------------------------------------------------------------------------------------- + + control_add_embed_dim = 256 + class ControlAddEmbedding(nn.Module): + def __init__(self, in_dim, out_dim, num_control_type, dtype=None, device=None, operations=None): + super().__init__() + self.num_control_type = num_control_type + self.in_dim = in_dim + self.linear_1 = operations.Linear(in_dim * num_control_type, out_dim, dtype=dtype, device=device) + self.linear_2 = operations.Linear(out_dim, out_dim, dtype=dtype, device=device) + def forward(self, control_type, dtype, device): + c_type = torch.zeros((self.num_control_type,), device=device) + c_type[control_type] = 1.0 + c_type = timestep_embedding(c_type.flatten(), self.in_dim, 
repeat_only=False).to(dtype).reshape((-1, self.num_control_type * self.in_dim)) + return self.linear_2(torch.nn.functional.silu(self.linear_1(c_type))) + + self.control_add_embedding = ControlAddEmbedding(control_add_embed_dim, time_embed_dim, self.num_control_type, dtype=self.dtype, device=device, operations=operations) + else: + self.task_embedding = None + self.control_add_embedding = None + + def union_controlnet_merge(self, hint, control_type, emb, context): + # Equivalent to: https://github.com/xinsir6/ControlNetPlus/tree/main + inputs = [] + condition_list = [] + + for idx in range(min(1, len(control_type))): + controlnet_cond = self.input_hint_block(hint[idx], emb, context) + feat_seq = torch.mean(controlnet_cond, dim=(2, 3)) + if idx < len(control_type): + feat_seq += self.task_embedding[control_type[idx]].to(dtype=feat_seq.dtype, device=feat_seq.device) + + inputs.append(feat_seq.unsqueeze(1)) + condition_list.append(controlnet_cond) + + x = torch.cat(inputs, dim=1) + x = self.transformer_layes(x) + controlnet_cond_fuser = None + for idx in range(len(control_type)): + alpha = self.spatial_ch_projs(x[:, idx]) + alpha = alpha.unsqueeze(-1).unsqueeze(-1) + o = condition_list[idx] + alpha + if controlnet_cond_fuser is None: + controlnet_cond_fuser = o + else: + controlnet_cond_fuser += o + return controlnet_cond_fuser + def make_zero_conv(self, channels, operations=None, dtype=None, device=None): return TimestepEmbedSequential(operations.conv_nd(self.dims, channels, channels, 1, padding=0, dtype=dtype, device=device)) @@ -287,9 +386,21 @@ def forward(self, x, hint, timesteps, context, y=None, **kwargs): t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype) emb = self.time_embed(t_emb) - guided_hint = self.input_hint_block(hint, emb, context) + guided_hint = None + if self.control_add_embedding is not None: #Union Controlnet + control_type = kwargs.get("control_type", []) + + emb += self.control_add_embedding(control_type, 
emb.dtype, emb.device) + if len(control_type) > 0: + if len(hint.shape) < 5: + hint = hint.unsqueeze(dim=0) + guided_hint = self.union_controlnet_merge(hint, control_type, emb, context) + + if guided_hint is None: + guided_hint = self.input_hint_block(hint, emb, context) - outs = [] + out_output = [] + out_middle = [] hs = [] if self.num_classes is not None: @@ -304,10 +415,10 @@ def forward(self, x, hint, timesteps, context, y=None, **kwargs): guided_hint = None else: h = module(h, emb, context) - outs.append(zero_conv(h, emb, context)) + out_output.append(zero_conv(h, emb, context)) h = self.middle_block(h, emb, context) - outs.append(self.middle_block_out(h, emb, context)) + out_middle.append(self.middle_block_out(h, emb, context)) - return outs + return {"middle": out_middle, "output": out_output} diff --git a/comfy/cldm/mmdit.py b/comfy/cldm/mmdit.py new file mode 100644 index 000000000000..025c2fb5dff3 --- /dev/null +++ b/comfy/cldm/mmdit.py @@ -0,0 +1,77 @@ +import torch +from typing import Dict, Optional +import comfy.ldm.modules.diffusionmodules.mmdit + +class ControlNet(comfy.ldm.modules.diffusionmodules.mmdit.MMDiT): + def __init__( + self, + num_blocks = None, + dtype = None, + device = None, + operations = None, + **kwargs, + ): + super().__init__(dtype=dtype, device=device, operations=operations, final_layer=False, num_blocks=num_blocks, **kwargs) + # controlnet_blocks + self.controlnet_blocks = torch.nn.ModuleList([]) + for _ in range(len(self.joint_blocks)): + self.controlnet_blocks.append(operations.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype)) + + self.pos_embed_input = comfy.ldm.modules.diffusionmodules.mmdit.PatchEmbed( + None, + self.patch_size, + self.in_channels, + self.hidden_size, + bias=True, + strict_img_size=False, + dtype=dtype, + device=device, + operations=operations + ) + + def forward( + self, + x: torch.Tensor, + timesteps: torch.Tensor, + y: Optional[torch.Tensor] = None, + context: Optional[torch.Tensor] 
= None, + hint = None, + ) -> torch.Tensor: + + #weird sd3 controlnet specific stuff + y = torch.zeros_like(y) + + if self.context_processor is not None: + context = self.context_processor(context) + + hw = x.shape[-2:] + x = self.x_embedder(x) + self.cropped_pos_embed(hw, device=x.device).to(dtype=x.dtype, device=x.device) + x += self.pos_embed_input(hint) + + c = self.t_embedder(timesteps, dtype=x.dtype) + if y is not None and self.y_embedder is not None: + y = self.y_embedder(y) + c = c + y + + if context is not None: + context = self.context_embedder(context) + + output = [] + + blocks = len(self.joint_blocks) + for i in range(blocks): + context, x = self.joint_blocks[i]( + context, + x, + c=c, + use_checkpoint=self.use_checkpoint, + ) + + out = self.controlnet_blocks[i](x) + count = self.depth // blocks + if i == blocks - 1: + count -= 1 + for j in range(count): + output.append(out) + + return {"output": output} diff --git a/comfy/cli_args.py b/comfy/cli_args.py index fb0d37ce7508..2397de3d624a 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -1,7 +1,10 @@ import argparse import enum +import os +from typing import Optional import comfy.options + class EnumAction(argparse.Action): """ Argparse action for handling Enums @@ -109,6 +112,7 @@ class LatentPreviewMethod(enum.Enum): vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.") vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).") +parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. 
Default is sha256.") parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.") parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.") @@ -118,11 +122,44 @@ class LatentPreviewMethod(enum.Enum): parser.add_argument("--windows-standalone-build", action="store_true", help="Windows standalone build: Enable convenient things that most people using the standalone windows build will probably enjoy (like auto opening the page on startup).") parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.") +parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.") parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.") parser.add_argument("--verbose", action="store_true", help="Enables more debug prints.") +# The default built-in provider hosted under web/ +DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest" + +parser.add_argument( + "--front-end-version", + type=str, + default=DEFAULT_VERSION_STRING, + help=""" + Specifies the version of the frontend to be used. This command needs internet connectivity to query and + download available frontend implementations from GitHub releases. + + The version string should be in the format of: + [repoOwner]/[repoName]@[version] + where version is one of: "latest" or a valid version number (e.g. 
"1.0.0") + """, +) + +def is_valid_directory(path: Optional[str]) -> Optional[str]: + """Validate if the given path is a directory.""" + if path is None: + return None + + if not os.path.isdir(path): + raise argparse.ArgumentTypeError(f"{path} is not a valid directory.") + return path + +parser.add_argument( + "--front-end-root", + type=is_valid_directory, + default=None, + help="The local filesystem path to the directory where the frontend is located. Overrides --front-end-version.", +) if comfy.options.args_parsing: args = parser.parse_args() diff --git a/comfy/clip_model.py b/comfy/clip_model.py index 14f43c5687cb..8bfe504f2900 100644 --- a/comfy/clip_model.py +++ b/comfy/clip_model.py @@ -121,7 +121,7 @@ def __init__(self, config_dict, dtype, device, operations): self.text_model = CLIPTextModel_(config_dict, dtype, device, operations) embed_dim = config_dict["hidden_size"] self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device) - self.text_projection.weight.copy_(torch.eye(embed_dim)) + self.text_projection.weight.data.copy_(torch.eye(embed_dim)) self.dtype = dtype def get_input_embeddings(self): diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index acc86be85566..20dc3345d0fe 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -34,6 +34,7 @@ def __init__(self, json_config): with open(json_config) as f: config = json.load(f) + self.image_size = config.get("image_size", 224) self.load_device = comfy.model_management.text_encoder_device() offload_device = comfy.model_management.text_encoder_offload_device() self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) @@ -50,7 +51,7 @@ def get_sd(self): def encode_image(self, image): comfy.model_management.load_model_gpu(self.patcher) - pixel_values = clip_preprocess(image.to(self.load_device)).float() + pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size).float() out = self.model(pixel_values=pixel_values, 
intermediate_output=-2) outputs = Output() @@ -93,7 +94,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False): elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json") elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd: - json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json") + if sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577: + json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json") + else: + json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json") else: return None diff --git a/comfy/clip_vision_config_vitl_336.json b/comfy/clip_vision_config_vitl_336.json new file mode 100644 index 000000000000..f26945273d99 --- /dev/null +++ b/comfy/clip_vision_config_vitl_336.json @@ -0,0 +1,18 @@ +{ + "attention_dropout": 0.0, + "dropout": 0.0, + "hidden_act": "quick_gelu", + "hidden_size": 1024, + "image_size": 336, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-5, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dim": 768, + "torch_dtype": "float32" +} diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 8cf4a61a6833..12e5f16c88e5 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -7,10 +7,12 @@ import comfy.model_detection import comfy.model_patcher import comfy.ops +import comfy.latent_formats import comfy.cldm.cldm import comfy.t2i_adapter.adapter import comfy.ldm.cascade.controlnet +import comfy.cldm.mmdit def broadcast_image_to(tensor, target_batch_size, batched_number): @@ -37,20 +39,25 @@ def __init__(self, device=None): self.cond_hint = None self.strength = 1.0 
self.timestep_percent_range = (0.0, 1.0) + self.latent_format = None + self.vae = None self.global_average_pooling = False self.timestep_range = None self.compression_ratio = 8 self.upscale_algorithm = 'nearest-exact' + self.extra_args = {} if device is None: device = comfy.model_management.get_torch_device() self.device = device self.previous_controlnet = None - def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0)): + def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None): self.cond_hint_original = cond_hint self.strength = strength self.timestep_percent_range = timestep_percent_range + if self.latent_format is not None: + self.vae = vae return self def pre_run(self, model, percent_to_timestep_function): @@ -83,43 +90,36 @@ def copy_to(self, c): c.global_average_pooling = self.global_average_pooling c.compression_ratio = self.compression_ratio c.upscale_algorithm = self.upscale_algorithm + c.latent_format = self.latent_format + c.extra_args = self.extra_args.copy() + c.vae = self.vae def inference_memory_requirements(self, dtype): if self.previous_controlnet is not None: return self.previous_controlnet.inference_memory_requirements(dtype) return 0 - def control_merge(self, control_input, control_output, control_prev, output_dtype): + def control_merge(self, control, control_prev, output_dtype): out = {'input':[], 'middle':[], 'output': []} - if control_input is not None: - for i in range(len(control_input)): - key = 'input' - x = control_input[i] - if x is not None: - x *= self.strength - if x.dtype != output_dtype: - x = x.to(output_dtype) - out[key].insert(0, x) - - if control_output is not None: + for key in control: + control_output = control[key] + applied_to = set() for i in range(len(control_output)): - if i == (len(control_output) - 1): - key = 'middle' - index = 0 - else: - key = 'output' - index = i x = control_output[i] if x is not None: if self.global_average_pooling: x = 
torch.mean(x, dim=(2, 3), keepdim=True).repeat(1, 1, x.shape[2], x.shape[3]) - x *= self.strength + if x not in applied_to: #memory saving strategy, allow shared tensors and only apply strength to shared tensors once + applied_to.add(x) + x *= self.strength + if x.dtype != output_dtype: x = x.to(output_dtype) out[key].append(x) + if control_prev is not None: for x in ['input', 'middle', 'output']: o = out[x] @@ -134,20 +134,26 @@ def control_merge(self, control_input, control_output, control_prev, output_dtyp if o[i].shape[0] < prev_val.shape[0]: o[i] = prev_val + o[i] else: - o[i] += prev_val + o[i] = prev_val + o[i] #TODO: change back to inplace add if shared tensors stop being an issue return out + def set_extra_arg(self, argument, value=None): + self.extra_args[argument] = value + + class ControlNet(ControlBase): - def __init__(self, control_model=None, global_average_pooling=False, device=None, load_device=None, manual_cast_dtype=None): + def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, device=None, load_device=None, manual_cast_dtype=None): super().__init__(device) self.control_model = control_model self.load_device = load_device if control_model is not None: self.control_model_wrapped = comfy.model_patcher.ModelPatcher(self.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device()) + self.compression_ratio = compression_ratio self.global_average_pooling = global_average_pooling self.model_sampling_current = None self.manual_cast_dtype = manual_cast_dtype + self.latent_format = latent_format def get_control(self, x_noisy, t, cond, batched_number): control_prev = None @@ -170,7 +176,17 @@ def get_control(self, x_noisy, t, cond, batched_number): if self.cond_hint is not None: del self.cond_hint self.cond_hint = None - self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * self.compression_ratio, x_noisy.shape[2] * 
self.compression_ratio, self.upscale_algorithm, "center").to(dtype).to(self.device) + compression_ratio = self.compression_ratio + if self.vae is not None: + compression_ratio *= self.vae.downscale_ratio + self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center") + if self.vae is not None: + loaded_models = comfy.model_management.loaded_models(only_currently_used=True) + self.cond_hint = self.vae.encode(self.cond_hint.movedim(1, -1)) + comfy.model_management.load_models_gpu(loaded_models) + if self.latent_format is not None: + self.cond_hint = self.latent_format.process_in(self.cond_hint) + self.cond_hint = self.cond_hint.to(device=self.device, dtype=dtype) if x_noisy.shape[0] != self.cond_hint.shape[0]: self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number) @@ -181,8 +197,8 @@ def get_control(self, x_noisy, t, cond, batched_number): timestep = self.model_sampling_current.timestep(t) x_noisy = self.model_sampling_current.calculate_input(t, x_noisy) - control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.float(), context=context.to(dtype), y=y) - return self.control_merge(None, control, control_prev, output_dtype) + control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.float(), context=context.to(dtype), y=y, **self.extra_args) + return self.control_merge(control, control_prev, output_dtype) def copy(self): c = ControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype) @@ -322,6 +338,39 @@ def get_models(self): def inference_memory_requirements(self, dtype): return comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype) +def load_controlnet_mmdit(sd): + new_sd = 
comfy.model_detection.convert_diffusers_mmdit(sd, "") + model_config = comfy.model_detection.model_config_from_unet(new_sd, "", True) + num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.') + for k in sd: + new_sd[k] = sd[k] + + supported_inference_dtypes = model_config.supported_inference_dtypes + + controlnet_config = model_config.unet_config + unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes) + load_device = comfy.model_management.get_torch_device() + manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device) + if manual_cast_dtype is not None: + operations = comfy.ops.manual_cast + else: + operations = comfy.ops.disable_weight_init + + control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, operations=operations, device=load_device, dtype=unet_dtype, **controlnet_config) + missing, unexpected = control_model.load_state_dict(new_sd, strict=False) + + if len(missing) > 0: + logging.warning("missing controlnet keys: {}".format(missing)) + + if len(unexpected) > 0: + logging.debug("unexpected controlnet keys: {}".format(unexpected)) + + latent_format = comfy.latent_formats.SD3() + latent_format.shift_factor = 0 #SD3 controlnet weirdness + control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype) + return control + + def load_controlnet(ckpt_path, model=None): controlnet_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True) if "lora_controlnet" in controlnet_data: @@ -370,10 +419,18 @@ def load_controlnet(ckpt_path, model=None): if k in controlnet_data: new_sd[diffusers_keys[k]] = controlnet_data.pop(k) + if "control_add_embedding.linear_1.bias" in controlnet_data: #Union Controlnet + controlnet_config["union_controlnet_num_control_type"] = controlnet_data["task_embedding"].shape[0] + for k in list(controlnet_data.keys()): + new_k = k.replace('.attn.in_proj_', 
'.attn.in_proj.') + new_sd[new_k] = controlnet_data.pop(k) + leftover_keys = controlnet_data.keys() if len(leftover_keys) > 0: logging.warning("leftover keys: {}".format(leftover_keys)) controlnet_data = new_sd + elif "controlnet_blocks.0.weight" in controlnet_data: #SD3 diffusers format + return load_controlnet_mmdit(controlnet_data) pth_key = 'control_model.zero_convs.0.0.weight' pth = False @@ -490,12 +547,11 @@ def get_control(self, x_noisy, t, cond, batched_number): self.control_input = self.t2i_model(self.cond_hint.to(x_noisy.dtype)) self.t2i_model.cpu() - control_input = list(map(lambda a: None if a is None else a.clone(), self.control_input)) - mid = None - if self.t2i_model.xl == True: - mid = control_input[-1:] - control_input = control_input[:-1] - return self.control_merge(control_input, mid, control_prev, x_noisy.dtype) + control_input = {} + for k in self.control_input: + control_input[k] = list(map(lambda a: None if a is None else a.clone(), self.control_input[k])) + + return self.control_merge(control_input, control_prev, x_noisy.dtype) def copy(self): c = T2IAdapter(self.t2i_model, self.channels_in, self.compression_ratio, self.upscale_algorithm) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index f8091bb3fc18..763d8cc78d3d 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -998,7 +998,7 @@ def sample_deis(model, x, sigmas, extra_args=None, callback=None, disable=None, return x_next @torch.no_grad() -def sample_euler_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): +def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): extra_args = {} if extra_args is None else extra_args temp = [0] @@ -1013,16 +1013,16 @@ def post_cfg_function(args): for i in trange(len(sigmas) - 1, disable=disable): sigma_hat = sigmas[i] denoised = model(x, sigma_hat * s_in, **extra_args) - d = to_d(x - denoised + temp[0], sigma_hat, denoised) + d = to_d(x, sigma_hat, 
temp[0]) if callback is not None: callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised}) dt = sigmas[i + 1] - sigma_hat # Euler method - x = x + d * dt + x = denoised + d * sigmas[i + 1] return x @torch.no_grad() -def sample_euler_ancestral_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): +def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): """Ancestral sampling with Euler method steps.""" extra_args = {} if extra_args is None else extra_args noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler @@ -1041,10 +1041,10 @@ def post_cfg_function(args): sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta) if callback is not None: callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) - d = to_d(x - denoised + temp[0], sigmas[i], denoised) + d = to_d(x, sigmas[i], temp[0]) # Euler method dt = sigma_down - sigmas[i] - x = x + d * dt + x = denoised + d * sigma_down if sigmas[i + 1] > 0: x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up return x diff --git a/comfy/ldm/aura/mmdit.py b/comfy/ldm/aura/mmdit.py new file mode 100644 index 000000000000..c465619bd0aa --- /dev/null +++ b/comfy/ldm/aura/mmdit.py @@ -0,0 +1,479 @@ +#AuraFlow MMDiT +#Originally written by the AuraFlow Authors + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from comfy.ldm.modules.attention import optimized_attention + +def modulate(x, shift, scale): + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + + +def find_multiple(n: int, k: int) -> int: + if n % k == 0: + return n + return n + k - (n % k) + + +class MLP(nn.Module): + def __init__(self, dim, hidden_dim=None, dtype=None, device=None, operations=None) -> None: + super().__init__() + if hidden_dim is None: + 
hidden_dim = 4 * dim + + n_hidden = int(2 * hidden_dim / 3) + n_hidden = find_multiple(n_hidden, 256) + + self.c_fc1 = operations.Linear(dim, n_hidden, bias=False, dtype=dtype, device=device) + self.c_fc2 = operations.Linear(dim, n_hidden, bias=False, dtype=dtype, device=device) + self.c_proj = operations.Linear(n_hidden, dim, bias=False, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.silu(self.c_fc1(x)) * self.c_fc2(x) + x = self.c_proj(x) + return x + + +class MultiHeadLayerNorm(nn.Module): + def __init__(self, hidden_size=None, eps=1e-5, dtype=None, device=None): + # Copy pasta from https://github.com/huggingface/transformers/blob/e5f71ecaae50ea476d1e12351003790273c4b2ed/src/transformers/models/cohere/modeling_cohere.py#L78 + + super().__init__() + self.weight = nn.Parameter(torch.empty(hidden_size, dtype=dtype, device=device)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + mean = hidden_states.mean(-1, keepdim=True) + variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True) + hidden_states = (hidden_states - mean) * torch.rsqrt( + variance + self.variance_epsilon + ) + hidden_states = self.weight.to(torch.float32) * hidden_states + return hidden_states.to(input_dtype) + +class SingleAttention(nn.Module): + def __init__(self, dim, n_heads, mh_qknorm=False, dtype=None, device=None, operations=None): + super().__init__() + + self.n_heads = n_heads + self.head_dim = dim // n_heads + + # this is for cond + self.w1q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w1k = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w1v = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w1o = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + + self.q_norm1 = ( + MultiHeadLayerNorm((self.n_heads, self.head_dim), dtype=dtype, 
device=device) + if mh_qknorm + else operations.LayerNorm(self.head_dim, elementwise_affine=False, dtype=dtype, device=device) + ) + self.k_norm1 = ( + MultiHeadLayerNorm((self.n_heads, self.head_dim), dtype=dtype, device=device) + if mh_qknorm + else operations.LayerNorm(self.head_dim, elementwise_affine=False, dtype=dtype, device=device) + ) + + #@torch.compile() + def forward(self, c): + + bsz, seqlen1, _ = c.shape + + q, k, v = self.w1q(c), self.w1k(c), self.w1v(c) + q = q.view(bsz, seqlen1, self.n_heads, self.head_dim) + k = k.view(bsz, seqlen1, self.n_heads, self.head_dim) + v = v.view(bsz, seqlen1, self.n_heads, self.head_dim) + q, k = self.q_norm1(q), self.k_norm1(k) + + output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True) + c = self.w1o(output) + return c + + + +class DoubleAttention(nn.Module): + def __init__(self, dim, n_heads, mh_qknorm=False, dtype=None, device=None, operations=None): + super().__init__() + + self.n_heads = n_heads + self.head_dim = dim // n_heads + + # this is for cond + self.w1q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w1k = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w1v = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w1o = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + + # this is for x + self.w2q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w2k = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w2v = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + self.w2o = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device) + + self.q_norm1 = ( + MultiHeadLayerNorm((self.n_heads, self.head_dim), dtype=dtype, device=device) + if mh_qknorm + else operations.LayerNorm(self.head_dim, elementwise_affine=False, dtype=dtype, device=device) + ) + self.k_norm1 = ( + 
MultiHeadLayerNorm((self.n_heads, self.head_dim), dtype=dtype, device=device) + if mh_qknorm + else operations.LayerNorm(self.head_dim, elementwise_affine=False, dtype=dtype, device=device) + ) + + self.q_norm2 = ( + MultiHeadLayerNorm((self.n_heads, self.head_dim), dtype=dtype, device=device) + if mh_qknorm + else operations.LayerNorm(self.head_dim, elementwise_affine=False, dtype=dtype, device=device) + ) + self.k_norm2 = ( + MultiHeadLayerNorm((self.n_heads, self.head_dim), dtype=dtype, device=device) + if mh_qknorm + else operations.LayerNorm(self.head_dim, elementwise_affine=False, dtype=dtype, device=device) + ) + + + #@torch.compile() + def forward(self, c, x): + + bsz, seqlen1, _ = c.shape + bsz, seqlen2, _ = x.shape + seqlen = seqlen1 + seqlen2 + + cq, ck, cv = self.w1q(c), self.w1k(c), self.w1v(c) + cq = cq.view(bsz, seqlen1, self.n_heads, self.head_dim) + ck = ck.view(bsz, seqlen1, self.n_heads, self.head_dim) + cv = cv.view(bsz, seqlen1, self.n_heads, self.head_dim) + cq, ck = self.q_norm1(cq), self.k_norm1(ck) + + xq, xk, xv = self.w2q(x), self.w2k(x), self.w2v(x) + xq = xq.view(bsz, seqlen2, self.n_heads, self.head_dim) + xk = xk.view(bsz, seqlen2, self.n_heads, self.head_dim) + xv = xv.view(bsz, seqlen2, self.n_heads, self.head_dim) + xq, xk = self.q_norm2(xq), self.k_norm2(xk) + + # concat all + q, k, v = ( + torch.cat([cq, xq], dim=1), + torch.cat([ck, xk], dim=1), + torch.cat([cv, xv], dim=1), + ) + + output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True) + + c, x = output.split([seqlen1, seqlen2], dim=1) + c = self.w1o(c) + x = self.w2o(x) + + return c, x + + +class MMDiTBlock(nn.Module): + def __init__(self, dim, heads=8, global_conddim=1024, is_last=False, dtype=None, device=None, operations=None): + super().__init__() + + self.normC1 = operations.LayerNorm(dim, elementwise_affine=False, dtype=dtype, device=device) + self.normC2 = operations.LayerNorm(dim, 
elementwise_affine=False, dtype=dtype, device=device) + if not is_last: + self.mlpC = MLP(dim, hidden_dim=dim * 4, dtype=dtype, device=device, operations=operations) + self.modC = nn.Sequential( + nn.SiLU(), + operations.Linear(global_conddim, 6 * dim, bias=False, dtype=dtype, device=device), + ) + else: + self.modC = nn.Sequential( + nn.SiLU(), + operations.Linear(global_conddim, 2 * dim, bias=False, dtype=dtype, device=device), + ) + + self.normX1 = operations.LayerNorm(dim, elementwise_affine=False, dtype=dtype, device=device) + self.normX2 = operations.LayerNorm(dim, elementwise_affine=False, dtype=dtype, device=device) + self.mlpX = MLP(dim, hidden_dim=dim * 4, dtype=dtype, device=device, operations=operations) + self.modX = nn.Sequential( + nn.SiLU(), + operations.Linear(global_conddim, 6 * dim, bias=False, dtype=dtype, device=device), + ) + + self.attn = DoubleAttention(dim, heads, dtype=dtype, device=device, operations=operations) + self.is_last = is_last + + #@torch.compile() + def forward(self, c, x, global_cond, **kwargs): + + cres, xres = c, x + + cshift_msa, cscale_msa, cgate_msa, cshift_mlp, cscale_mlp, cgate_mlp = ( + self.modC(global_cond).chunk(6, dim=1) + ) + + c = modulate(self.normC1(c), cshift_msa, cscale_msa) + + # xpath + xshift_msa, xscale_msa, xgate_msa, xshift_mlp, xscale_mlp, xgate_mlp = ( + self.modX(global_cond).chunk(6, dim=1) + ) + + x = modulate(self.normX1(x), xshift_msa, xscale_msa) + + # attention + c, x = self.attn(c, x) + + + c = self.normC2(cres + cgate_msa.unsqueeze(1) * c) + c = cgate_mlp.unsqueeze(1) * self.mlpC(modulate(c, cshift_mlp, cscale_mlp)) + c = cres + c + + x = self.normX2(xres + xgate_msa.unsqueeze(1) * x) + x = xgate_mlp.unsqueeze(1) * self.mlpX(modulate(x, xshift_mlp, xscale_mlp)) + x = xres + x + + return c, x + +class DiTBlock(nn.Module): + # like MMDiTBlock, but it only has X + def __init__(self, dim, heads=8, global_conddim=1024, dtype=None, device=None, operations=None): + super().__init__() + + self.norm1 
= operations.LayerNorm(dim, elementwise_affine=False, dtype=dtype, device=device) + self.norm2 = operations.LayerNorm(dim, elementwise_affine=False, dtype=dtype, device=device) + + self.modCX = nn.Sequential( + nn.SiLU(), + operations.Linear(global_conddim, 6 * dim, bias=False, dtype=dtype, device=device), + ) + + self.attn = SingleAttention(dim, heads, dtype=dtype, device=device, operations=operations) + self.mlp = MLP(dim, hidden_dim=dim * 4, dtype=dtype, device=device, operations=operations) + + #@torch.compile() + def forward(self, cx, global_cond, **kwargs): + cxres = cx + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.modCX( + global_cond + ).chunk(6, dim=1) + cx = modulate(self.norm1(cx), shift_msa, scale_msa) + cx = self.attn(cx) + cx = self.norm2(cxres + gate_msa.unsqueeze(1) * cx) + mlpout = self.mlp(modulate(cx, shift_mlp, scale_mlp)) + cx = gate_mlp.unsqueeze(1) * mlpout + + cx = cxres + cx + + return cx + + + +class TimestepEmbedder(nn.Module): + def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None): + super().__init__() + self.mlp = nn.Sequential( + operations.Linear(frequency_embedding_size, hidden_size, dtype=dtype, device=device), + nn.SiLU(), + operations.Linear(hidden_size, hidden_size, dtype=dtype, device=device), + ) + self.frequency_embedding_size = frequency_embedding_size + + @staticmethod + def timestep_embedding(t, dim, max_period=10000): + half = dim // 2 + freqs = 1000 * torch.exp( + -math.log(max_period) * torch.arange(start=0, end=half) / half + ).to(t.device) + args = t[:, None] * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1 + ) + return embedding + + #@torch.compile() + def forward(self, t, dtype): + t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype) + t_emb = self.mlp(t_freq) + return t_emb + + +class 
MMDiT(nn.Module): + def __init__( + self, + in_channels=4, + out_channels=4, + patch_size=2, + dim=3072, + n_layers=36, + n_double_layers=4, + n_heads=12, + global_conddim=3072, + cond_seq_dim=2048, + max_seq=32 * 32, + device=None, + dtype=None, + operations=None, + ): + super().__init__() + self.dtype = dtype + + self.t_embedder = TimestepEmbedder(global_conddim, dtype=dtype, device=device, operations=operations) + + self.cond_seq_linear = operations.Linear( + cond_seq_dim, dim, bias=False, dtype=dtype, device=device + ) # linear for something like text sequence. + self.init_x_linear = operations.Linear( + patch_size * patch_size * in_channels, dim, dtype=dtype, device=device + ) # init linear for patchified image. + + self.positional_encoding = nn.Parameter(torch.empty(1, max_seq, dim, dtype=dtype, device=device)) + self.register_tokens = nn.Parameter(torch.empty(1, 8, dim, dtype=dtype, device=device)) + + self.double_layers = nn.ModuleList([]) + self.single_layers = nn.ModuleList([]) + + + for idx in range(n_double_layers): + self.double_layers.append( + MMDiTBlock(dim, n_heads, global_conddim, is_last=(idx == n_layers - 1), dtype=dtype, device=device, operations=operations) + ) + + for idx in range(n_double_layers, n_layers): + self.single_layers.append( + DiTBlock(dim, n_heads, global_conddim, dtype=dtype, device=device, operations=operations) + ) + + + self.final_linear = operations.Linear( + dim, patch_size * patch_size * out_channels, bias=False, dtype=dtype, device=device + ) + + self.modF = nn.Sequential( + nn.SiLU(), + operations.Linear(global_conddim, 2 * dim, bias=False, dtype=dtype, device=device), + ) + + self.out_channels = out_channels + self.patch_size = patch_size + self.n_double_layers = n_double_layers + self.n_layers = n_layers + + self.h_max = round(max_seq**0.5) + self.w_max = round(max_seq**0.5) + + @torch.no_grad() + def extend_pe(self, init_dim=(16, 16), target_dim=(64, 64)): + # extend pe + pe_data = 
self.positional_encoding.data.squeeze(0)[: init_dim[0] * init_dim[1]] + + pe_as_2d = pe_data.view(init_dim[0], init_dim[1], -1).permute(2, 0, 1) + + # now we need to extend this to target_dim. for this we will use interpolation. + # we will use torch.nn.functional.interpolate + pe_as_2d = F.interpolate( + pe_as_2d.unsqueeze(0), size=target_dim, mode="bilinear" + ) + pe_new = pe_as_2d.squeeze(0).permute(1, 2, 0).flatten(0, 1) + self.positional_encoding.data = pe_new.unsqueeze(0).contiguous() + self.h_max, self.w_max = target_dim + print("PE extended to", target_dim) + + def pe_selection_index_based_on_dim(self, h, w): + h_p, w_p = h // self.patch_size, w // self.patch_size + original_pe_indexes = torch.arange(self.positional_encoding.shape[1]) + original_pe_indexes = original_pe_indexes.view(self.h_max, self.w_max) + starth = self.h_max // 2 - h_p // 2 + endh =starth + h_p + startw = self.w_max // 2 - w_p // 2 + endw = startw + w_p + original_pe_indexes = original_pe_indexes[ + starth:endh, startw:endw + ] + return original_pe_indexes.flatten() + + def unpatchify(self, x, h, w): + c = self.out_channels + p = self.patch_size + + x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) + x = torch.einsum("nhwpqc->nchpwq", x) + imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p)) + return imgs + + def patchify(self, x): + B, C, H, W = x.size() + pad_h = (self.patch_size - H % self.patch_size) % self.patch_size + pad_w = (self.patch_size - W % self.patch_size) % self.patch_size + + x = torch.nn.functional.pad(x, (0, pad_w, 0, pad_h), mode='reflect') + x = x.view( + B, + C, + (H + 1) // self.patch_size, + self.patch_size, + (W + 1) // self.patch_size, + self.patch_size, + ) + x = x.permute(0, 2, 4, 1, 3, 5).flatten(-3).flatten(1, 2) + return x + + def apply_pos_embeds(self, x, h, w): + h = (h + 1) // self.patch_size + w = (w + 1) // self.patch_size + max_dim = max(h, w) + + cur_dim = self.h_max + pos_encoding = self.positional_encoding.reshape(1, cur_dim, cur_dim, 
-1).to(device=x.device, dtype=x.dtype) + + if max_dim > cur_dim: + pos_encoding = F.interpolate(pos_encoding.movedim(-1, 1), (max_dim, max_dim), mode="bilinear").movedim(1, -1) + cur_dim = max_dim + + from_h = (cur_dim - h) // 2 + from_w = (cur_dim - w) // 2 + pos_encoding = pos_encoding[:,from_h:from_h+h,from_w:from_w+w] + return x + pos_encoding.reshape(1, -1, self.positional_encoding.shape[-1]) + + def forward(self, x, timestep, context, **kwargs): + # patchify x, add PE + b, c, h, w = x.shape + + # pe_indexes = self.pe_selection_index_based_on_dim(h, w) + # print(pe_indexes, pe_indexes.shape) + + x = self.init_x_linear(self.patchify(x)) # B, T_x, D + x = self.apply_pos_embeds(x, h, w) + # x = x + self.positional_encoding[:, : x.size(1)].to(device=x.device, dtype=x.dtype) + # x = x + self.positional_encoding[:, pe_indexes].to(device=x.device, dtype=x.dtype) + + # process conditions for MMDiT Blocks + c_seq = context # B, T_c, D_c + t = timestep + + c = self.cond_seq_linear(c_seq) # B, T_c, D + c = torch.cat([self.register_tokens.to(device=c.device, dtype=c.dtype).repeat(c.size(0), 1, 1), c], dim=1) + + global_cond = self.t_embedder(t, x.dtype) # B, D + + if len(self.double_layers) > 0: + for layer in self.double_layers: + c, x = layer(c, x, global_cond, **kwargs) + + if len(self.single_layers) > 0: + c_len = c.size(1) + cx = torch.cat([c, x], dim=1) + for layer in self.single_layers: + cx = layer(cx, global_cond, **kwargs) + + x = cx[:, c_len:] + + fshift, fscale = self.modF(global_cond).chunk(2, dim=1) + + x = modulate(x, fshift, fscale) + x = self.final_linear(x) + x = self.unpatchify(x, (h + 1) // self.patch_size, (w + 1) // self.patch_size)[:,:,:h,:w] + return x diff --git a/comfy/ldm/cascade/controlnet.py b/comfy/ldm/cascade/controlnet.py index 5dac5939409a..7a52c3c263f9 100644 --- a/comfy/ldm/cascade/controlnet.py +++ b/comfy/ldm/cascade/controlnet.py @@ -90,4 +90,4 @@ def forward(self, x): proj_outputs = [None for _ in range(max(self.proj_blocks) + 1)] 
for i, idx in enumerate(self.proj_blocks): proj_outputs[idx] = self.projections[i](x) - return proj_outputs + return {"input": proj_outputs[::-1]} diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py index 20d3a321a02a..927451534d7e 100644 --- a/comfy/ldm/modules/diffusionmodules/mmdit.py +++ b/comfy/ldm/modules/diffusionmodules/mmdit.py @@ -745,6 +745,8 @@ def __init__( qkv_bias: bool = True, context_processor_layers = None, context_size = 4096, + num_blocks = None, + final_layer = True, dtype = None, #TODO device = None, operations = None, @@ -766,7 +768,10 @@ def __init__( # apply magic --> this defines a head_size of 64 self.hidden_size = 64 * depth num_heads = depth + if num_blocks is None: + num_blocks = depth + self.depth = depth self.num_heads = num_heads self.x_embedder = PatchEmbed( @@ -821,7 +826,7 @@ def __init__( mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, attn_mode=attn_mode, - pre_only=i == depth - 1, + pre_only=(i == num_blocks - 1) and final_layer, rmsnorm=rmsnorm, scale_mod_only=scale_mod_only, swiglu=swiglu, @@ -830,11 +835,12 @@ def __init__( device=device, operations=operations ) - for i in range(depth) + for i in range(num_blocks) ] ) - self.final_layer = FinalLayer(self.hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations) + if final_layer: + self.final_layer = FinalLayer(self.hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations) if compile_core: assert False @@ -893,6 +899,7 @@ def forward_core_with_concat( x: torch.Tensor, c_mod: torch.Tensor, context: Optional[torch.Tensor] = None, + control = None, ) -> torch.Tensor: if self.register_length > 0: context = torch.cat( @@ -905,13 +912,20 @@ def forward_core_with_concat( # context is B, L', D # x is B, L, D - for block in self.joint_blocks: - context, x = block( + blocks = len(self.joint_blocks) + for i in range(blocks): + context, x = self.joint_blocks[i]( context, 
x, c=c_mod, use_checkpoint=self.use_checkpoint, ) + if control is not None: + control_o = control.get("output") + if i < len(control_o): + add = control_o[i] + if add is not None: + x += add x = self.final_layer(x, c_mod) # (N, T, patch_size ** 2 * out_channels) return x @@ -922,6 +936,7 @@ def forward( t: torch.Tensor, y: Optional[torch.Tensor] = None, context: Optional[torch.Tensor] = None, + control = None, ) -> torch.Tensor: """ Forward pass of DiT. @@ -943,7 +958,7 @@ def forward( if context is not None: context = self.context_embedder(context) - x = self.forward_core_with_concat(x, c, context) + x = self.forward_core_with_concat(x, c, context, control) x = self.unpatchify(x, hw=hw) # (N, out_channels, H, W) return x[:,:,:hw[-2],:hw[-1]] @@ -956,7 +971,8 @@ def forward( timesteps: torch.Tensor, context: Optional[torch.Tensor] = None, y: Optional[torch.Tensor] = None, + control = None, **kwargs, ) -> torch.Tensor: - return super().forward(x, timesteps, context=context, y=y) + return super().forward(x, timesteps, context=context, y=y, control=control) diff --git a/comfy/lora.py b/comfy/lora.py index 0374317756b4..e36b354f04fd 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -274,4 +274,12 @@ def model_lora_keys_unet(model, key_map={}): key_lora = "lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_")) #OneTrainer lora key_map[key_lora] = to + if isinstance(model, comfy.model_base.AuraFlow): #Diffusers lora AuraFlow + diffusers_keys = comfy.utils.auraflow_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.") + for k in diffusers_keys: + if k.endswith(".weight"): + to = diffusers_keys[k] + key_lora = "transformer.{}".format(k[:-len(".weight")]) #simpletrainer and probably regular diffusers lora format + key_map[key_lora] = to + return key_map diff --git a/comfy/model_base.py b/comfy/model_base.py index f45b375dee5a..0e0e69d3ba32 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -6,6 +6,7 @@ from 
comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation from comfy.ldm.modules.diffusionmodules.mmdit import OpenAISignatureMMDITWrapper +import comfy.ldm.aura.mmdit import comfy.ldm.audio.dit import comfy.ldm.audio.embedders import comfy.model_management @@ -598,6 +599,17 @@ def memory_required(self, input_shape): area = input_shape[0] * input_shape[2] * input_shape[3] return (area * 0.3) * (1024 * 1024) +class AuraFlow(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.aura.mmdit.MMDiT) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + return out + class StableAudio1(BaseModel): def __init__(self, model_config, seconds_start_embedder_weights, seconds_total_embedder_weights, model_type=ModelType.V_PREDICTION_CONTINUOUS, device=None): @@ -627,3 +639,12 @@ def extra_conds(self, **kwargs): cross_attn = torch.cat([cross_attn.to(device), seconds_start_embed.repeat((cross_attn.shape[0], 1, 1)), seconds_total_embed.repeat((cross_attn.shape[0], 1, 1))], dim=1) out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) return out + + def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None): + sd = super().state_dict_for_saving(clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict) + d = {"conditioner.conditioners.seconds_start.": self.seconds_start_embedder.state_dict(), "conditioner.conditioners.seconds_total.": self.seconds_total_embedder.state_dict()} + for k in d: + s = d[k] + for l in s: + sd["{}{}".format(k, l)] = s[l] + return sd diff --git a/comfy/model_detection.py 
b/comfy/model_detection.py index e09dd381ad95..c62e2b822d5c 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -41,7 +41,9 @@ def detect_unet_config(state_dict, key_prefix): unet_config["in_channels"] = state_dict['{}x_embedder.proj.weight'.format(key_prefix)].shape[1] patch_size = state_dict['{}x_embedder.proj.weight'.format(key_prefix)].shape[2] unet_config["patch_size"] = patch_size - unet_config["out_channels"] = state_dict['{}final_layer.linear.weight'.format(key_prefix)].shape[0] // (patch_size * patch_size) + final_layer = '{}final_layer.linear.weight'.format(key_prefix) + if final_layer in state_dict: + unet_config["out_channels"] = state_dict[final_layer].shape[0] // (patch_size * patch_size) unet_config["depth"] = state_dict['{}x_embedder.proj.weight'.format(key_prefix)].shape[0] // 64 unet_config["input_size"] = None @@ -103,6 +105,19 @@ def detect_unet_config(state_dict, key_prefix): unet_config["audio_model"] = "dit1.0" return unet_config + if '{}double_layers.0.attn.w1q.weight'.format(key_prefix) in state_dict_keys: #aura flow dit + unet_config = {} + unet_config["max_seq"] = state_dict['{}positional_encoding'.format(key_prefix)].shape[1] + unet_config["cond_seq_dim"] = state_dict['{}cond_seq_linear.weight'.format(key_prefix)].shape[1] + double_layers = count_blocks(state_dict_keys, '{}double_layers.'.format(key_prefix) + '{}.') + single_layers = count_blocks(state_dict_keys, '{}single_layers.'.format(key_prefix) + '{}.') + unet_config["n_double_layers"] = double_layers + unet_config["n_layers"] = double_layers + single_layers + return unet_config + + if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: + return None + unet_config = { "use_checkpoint": False, "image_size": 32, @@ -237,6 +252,8 @@ def model_config_from_unet_config(unet_config, state_dict=None): def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False): unet_config = detect_unet_config(state_dict, unet_key_prefix) + if 
unet_config is None: + return None model_config = model_config_from_unet_config(unet_config, state_dict) if model_config is None and use_base_if_no_match: return comfy.supported_models_base.BASE(unet_config) @@ -246,6 +263,8 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal def unet_prefix_from_state_dict(state_dict): if "model.model.postprocess_conv.weight" in state_dict: #audio models unet_key_prefix = "model.model." + elif "model.double_layers.0.attn.w1q.weight" in state_dict: #aura flow + unet_key_prefix = "model." else: unet_key_prefix = "model.diffusion_model." return unet_key_prefix @@ -435,36 +454,45 @@ def model_config_from_diffusers_unet(state_dict): return None def convert_diffusers_mmdit(state_dict, output_prefix=""): - depth = count_blocks(state_dict, 'transformer_blocks.{}.') - if depth > 0: - out_sd = {} - sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth}, output_prefix=output_prefix) - for k in sd_map: - weight = state_dict.get(k, None) - if weight is not None: - t = sd_map[k] - - if not isinstance(t, str): - if len(t) > 2: - fun = t[2] - else: - fun = lambda a: a - offset = t[1] - if offset is not None: - old_weight = out_sd.get(t[0], None) - if old_weight is None: - old_weight = torch.empty_like(weight) - old_weight = old_weight.repeat([3] + [1] * (len(old_weight.shape) - 1)) - - w = old_weight.narrow(offset[0], offset[1], offset[2]) - else: - old_weight = weight - w = weight - w[:] = fun(weight) - t = t[0] - out_sd[t] = old_weight + out_sd = {} + + if 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict: #SD3 + num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.') + depth = state_dict["pos_embed.proj.weight"].shape[0] // 64 + sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix) + elif 'joint_transformer_blocks.0.attn.add_k_proj.weight' in state_dict: #AuraFlow + num_joint = count_blocks(state_dict, 'joint_transformer_blocks.{}.') 
+ num_single = count_blocks(state_dict, 'single_transformer_blocks.{}.') + sd_map = comfy.utils.auraflow_to_diffusers({"n_double_layers": num_joint, "n_layers": num_joint + num_single}, output_prefix=output_prefix) + else: + return None + + for k in sd_map: + weight = state_dict.get(k, None) + if weight is not None: + t = sd_map[k] + + if not isinstance(t, str): + if len(t) > 2: + fun = t[2] else: - out_sd[t] = weight - state_dict.pop(k) + fun = lambda a: a + offset = t[1] + if offset is not None: + old_weight = out_sd.get(t[0], None) + if old_weight is None: + old_weight = torch.empty_like(weight) + old_weight = old_weight.repeat([3] + [1] * (len(old_weight.shape) - 1)) + + w = old_weight.narrow(offset[0], offset[1], offset[2]) + else: + old_weight = weight + w = weight + w[:] = fun(weight) + t = t[0] + out_sd[t] = old_weight + else: + out_sd[t] = weight + state_dict.pop(k) return out_sd diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index b949031e9986..efac251ca90a 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -57,6 +57,12 @@ def set_model_options_post_cfg_function(model_options, post_cfg_function, disabl model_options["disable_cfg1_optimization"] = True return model_options +def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_cfg1_optimization=False): + model_options["sampler_pre_cfg_function"] = model_options.get("sampler_pre_cfg_function", []) + [pre_cfg_function] + if disable_cfg1_optimization: + model_options["disable_cfg1_optimization"] = True + return model_options + class ModelPatcher: def __init__(self, model, load_device, offload_device, size=0, current_device=None, weight_inplace_update=False): self.size = size @@ -130,6 +136,9 @@ def set_model_sampler_cfg_function(self, sampler_cfg_function, disable_cfg1_opti def set_model_sampler_post_cfg_function(self, post_cfg_function, disable_cfg1_optimization=False): self.model_options = set_model_options_post_cfg_function(self.model_options, 
post_cfg_function, disable_cfg1_optimization) + def set_model_sampler_pre_cfg_function(self, pre_cfg_function, disable_cfg1_optimization=False): + self.model_options = set_model_options_pre_cfg_function(self.model_options, pre_cfg_function, disable_cfg1_optimization) + def set_model_unet_function_wrapper(self, unet_wrapper_function: UnetWrapperFunction): self.model_options["model_function_wrapper"] = unet_wrapper_function diff --git a/comfy/model_sampling.py b/comfy/model_sampling.py index 6bd3a5d79a5a..25bb7e043b62 100644 --- a/comfy/model_sampling.py +++ b/comfy/model_sampling.py @@ -59,8 +59,9 @@ def __init__(self, model_config=None): beta_schedule = sampling_settings.get("beta_schedule", "linear") linear_start = sampling_settings.get("linear_start", 0.00085) linear_end = sampling_settings.get("linear_end", 0.012) + timesteps = sampling_settings.get("timesteps", 1000) - self._register_schedule(given_betas=None, beta_schedule=beta_schedule, timesteps=1000, linear_start=linear_start, linear_end=linear_end, cosine_s=8e-3) + self._register_schedule(given_betas=None, beta_schedule=beta_schedule, timesteps=timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=8e-3) self.sigma_data = 1.0 def _register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000, @@ -190,11 +191,12 @@ def __init__(self, model_config=None): else: sampling_settings = {} - self.set_parameters(shift=sampling_settings.get("shift", 1.0)) + self.set_parameters(shift=sampling_settings.get("shift", 1.0), multiplier=sampling_settings.get("multiplier", 1000)) - def set_parameters(self, shift=1.0, timesteps=1000): + def set_parameters(self, shift=1.0, timesteps=1000, multiplier=1000): self.shift = shift - ts = self.sigma(torch.arange(1, timesteps + 1, 1)) + self.multiplier = multiplier + ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps) * multiplier) self.register_buffer('sigmas', ts) @property @@ -206,10 +208,10 @@ def sigma_max(self): return 
self.sigmas[-1] def timestep(self, sigma): - return sigma * 1000 + return sigma * self.multiplier def sigma(self, timestep): - return time_snr_shift(self.shift, timestep / 1000) + return time_snr_shift(self.shift, timestep / self.multiplier) def percent_to_sigma(self, percent): if percent <= 0.0: diff --git a/comfy/samplers.py b/comfy/samplers.py index 7f7114dbb19f..3f763381412e 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -6,6 +6,8 @@ import math import logging import comfy.sampler_helpers +import scipy +import numpy def get_area_and_mult(conds, x_in, timestep_in): dims = tuple(x_in.shape[2:]) @@ -275,6 +277,12 @@ def sampling_function(model, x, timestep, uncond, cond, cond_scale, model_option conds = [cond, uncond_] out = calc_cond_batch(model, conds, x, timestep, model_options) + + for fn in model_options.get("sampler_pre_cfg_function", []): + args = {"conds":conds, "conds_out": out, "cond_scale": cond_scale, "timestep": timestep, + "input": x, "sigma": timestep, "model": model, "model_options": model_options} + out = fn(args) + return cfg_function(model, out[0], out[1], cond_scale, x, timestep, model_options=model_options, cond=cond, uncond=uncond_) @@ -305,13 +313,18 @@ def simple_scheduler(model_sampling, steps): def ddim_scheduler(model_sampling, steps): s = model_sampling sigs = [] - ss = max(len(s.sigmas) // steps, 1) x = 1 + if math.isclose(float(s.sigmas[x]), 0, abs_tol=0.00001): + steps += 1 + sigs = [] + else: + sigs = [0.0] + + ss = max(len(s.sigmas) // steps, 1) while x < len(s.sigmas): sigs += [float(s.sigmas[x])] x += ss sigs = sigs[::-1] - sigs += [0.0] return torch.FloatTensor(sigs) def normal_scheduler(model_sampling, steps, sgm=False, floor=False): @@ -319,15 +332,34 @@ def normal_scheduler(model_sampling, steps, sgm=False, floor=False): start = s.timestep(s.sigma_max) end = s.timestep(s.sigma_min) + append_zero = True if sgm: timesteps = torch.linspace(start, end, steps + 1)[:-1] else: + if math.isclose(float(s.sigma(end)), 0, 
abs_tol=0.00001): + steps += 1 + append_zero = False timesteps = torch.linspace(start, end, steps) sigs = [] for x in range(len(timesteps)): ts = timesteps[x] - sigs.append(s.sigma(ts)) + sigs.append(float(s.sigma(ts))) + + if append_zero: + sigs += [0.0] + + return torch.FloatTensor(sigs) + +# Implemented based on: https://arxiv.org/abs/2407.12173 +def beta_scheduler(model_sampling, steps, alpha=0.6, beta=0.6): + total_timesteps = (len(model_sampling.sigmas) - 1) + ts = 1 - numpy.linspace(0, 1, steps, endpoint=False) + ts = numpy.rint(scipy.stats.beta.ppf(ts, alpha, beta) * total_timesteps) + + sigs = [] + for t in ts: + sigs += [float(model_sampling.sigmas[int(t)])] sigs += [0.0] return torch.FloatTensor(sigs) @@ -537,7 +569,7 @@ def max_denoise(self, model_wrap, sigmas): sigma = float(sigmas[0]) return math.isclose(max_sigma, sigma, rel_tol=1e-05) or sigma > max_sigma -KSAMPLER_NAMES = ["euler", "euler_pp", "euler_ancestral", "euler_ancestral_pp", "heun", "heunpp2","dpm_2", "dpm_2_ancestral", +KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_cfg_pp", "heun", "heunpp2","dpm_2", "dpm_2_ancestral", "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_sde", "dpmpp_sde_gpu", "dpmpp_2m", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm", "ipndm", "ipndm_v", "deis"] @@ -697,7 +729,7 @@ def sample(model, noise, positive, negative, cfg, device, sampler, sigmas, model return cfg_guider.sample(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed) -SCHEDULER_NAMES = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform"] +SCHEDULER_NAMES = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform", "beta"] SAMPLER_NAMES = KSAMPLER_NAMES + ["ddim", "uni_pc", "uni_pc_bh2"] def calculate_sigmas(model_sampling, scheduler_name, steps): @@ -713,6 +745,8 @@ def calculate_sigmas(model_sampling, scheduler_name, steps): sigmas = 
ddim_scheduler(model_sampling, steps) elif scheduler_name == "sgm_uniform": sigmas = normal_scheduler(model_sampling, steps, sgm=True) + elif scheduler_name == "beta": + sigmas = beta_scheduler(model_sampling, steps) else: logging.error("error invalid scheduler {}".format(scheduler_name)) return sigmas diff --git a/comfy/sd.py b/comfy/sd.py index ea6e9b663a39..17df5faffc63 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -19,8 +19,9 @@ from . import sd1_clip from . import sd2_clip from . import sdxl_clip -from . import sd3_clip -from . import sa_t5 +import comfy.text_encoders.sd3_clip +import comfy.text_encoders.sa_t5 +import comfy.text_encoders.aura_t5 import comfy.model_patcher import comfy.lora @@ -28,36 +29,6 @@ import comfy.supported_models_base import comfy.taesd.taesd -def load_model_weights(model, sd): - m, u = model.load_state_dict(sd, strict=False) - m = set(m) - unexpected_keys = set(u) - - k = list(sd.keys()) - for x in k: - if x not in unexpected_keys: - w = sd.pop(x) - del w - if len(m) > 0: - logging.warning("missing {}".format(m)) - return model - -def load_clip_weights(model, sd): - k = list(sd.keys()) - for x in k: - if x.startswith("cond_stage_model.transformer.") and not x.startswith("cond_stage_model.transformer.text_model."): - y = x.replace("cond_stage_model.transformer.", "cond_stage_model.transformer.text_model.") - sd[y] = sd.pop(x) - - if 'cond_stage_model.transformer.text_model.embeddings.position_ids' in sd: - ids = sd['cond_stage_model.transformer.text_model.embeddings.position_ids'] - if ids.dtype == torch.float32: - sd['cond_stage_model.transformer.text_model.embeddings.position_ids'] = ids.round() - - sd = comfy.utils.clip_text_transformers_convert(sd, "cond_stage_model.model.", "cond_stage_model.transformer.") - return load_model_weights(model, sd) - - def load_lora_for_models(model, clip, lora, strength_model, strength_clip): key_map = {} if model is not None: @@ -130,7 +101,7 @@ def clip_layer(self, layer_idx): def tokenize(self, 
text, return_word_ids=False): return self.tokenizer.tokenize_with_weights(text, return_word_ids) - def encode_from_tokens(self, tokens, return_pooled=False): + def encode_from_tokens(self, tokens, return_pooled=False, return_dict=False): self.cond_stage_model.reset_clip_options() if self.layer_idx is not None: @@ -140,7 +111,15 @@ def encode_from_tokens(self, tokens, return_pooled=False): self.cond_stage_model.set_clip_options({"projected_pooled": False}) self.load_model() - cond, pooled = self.cond_stage_model.encode_token_weights(tokens) + o = self.cond_stage_model.encode_token_weights(tokens) + cond, pooled = o[:2] + if return_dict: + out = {"cond": cond, "pooled_output": pooled} + if len(o) > 2: + for k in o[2]: + out[k] = o[2][k] + return out + if return_pooled: return cond, pooled return cond @@ -236,7 +215,7 @@ def __init__(self, sd=None, device=None, config=None, dtype=None): self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig}, decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig}) - elif "decoder.layers.0.weight_v" in sd: + elif "decoder.layers.1.layers.0.beta" in sd: self.first_stage_model = AudioOobleckVAE() self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype) self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * 2048) * model_management.dtype_size(dtype) @@ -432,25 +411,30 @@ class EmptyClass: clip_target.clip = sd2_clip.SD2ClipModel clip_target.tokenizer = sd2_clip.SD2Tokenizer elif "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in clip_data[0]: - dtype_t5 = clip_data[0]["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"].dtype - clip_target.clip = sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, dtype_t5=dtype_t5) - clip_target.tokenizer = 
sd3_clip.SD3Tokenizer + weight = clip_data[0]["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"] + dtype_t5 = weight.dtype + if weight.shape[-1] == 4096: + clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, dtype_t5=dtype_t5) + clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer + elif weight.shape[-1] == 2048: + clip_target.clip = comfy.text_encoders.aura_t5.AuraT5Model + clip_target.tokenizer = comfy.text_encoders.aura_t5.AuraT5Tokenizer elif "encoder.block.0.layer.0.SelfAttention.k.weight" in clip_data[0]: - clip_target.clip = sa_t5.SAT5Model - clip_target.tokenizer = sa_t5.SAT5Tokenizer + clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model + clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer else: clip_target.clip = sd1_clip.SD1ClipModel clip_target.tokenizer = sd1_clip.SD1Tokenizer elif len(clip_data) == 2: if clip_type == CLIPType.SD3: - clip_target.clip = sd3_clip.sd3_clip(clip_l=True, clip_g=True, t5=False) - clip_target.tokenizer = sd3_clip.SD3Tokenizer + clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=True, t5=False) + clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer else: clip_target.clip = sdxl_clip.SDXLClipModel clip_target.tokenizer = sdxl_clip.SDXLTokenizer elif len(clip_data) == 3: - clip_target.clip = sd3_clip.SD3ClipModel - clip_target.tokenizer = sd3_clip.SD3Tokenizer + clip_target.clip = comfy.text_encoders.sd3_clip.SD3ClipModel + clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer clip = CLIP(clip_target, embedding_directory=embedding_directory) for c in clip_data: @@ -509,13 +493,13 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o load_device = model_management.get_torch_device() model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix) + if model_config is None: + raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path)) + 
unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes) manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) model_config.set_inference_dtype(unet_dtype, manual_cast_dtype) - if model_config is None: - raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path)) - if model_config.clip_vision_prefix is not None: if output_clipvision: clipvision = clip_vision.load_clipvision_from_sd(sd, model_config.clip_vision_prefix, True) @@ -563,37 +547,40 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o return (model_patcher, clip, vae, clipvision) -def load_unet_state_dict(sd): #load unet in diffusers format +def load_unet_state_dict(sd): #load unet in diffusers or regular format + + #Allow loading unets from checkpoint files + diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd) + temp_sd = comfy.utils.state_dict_prefix_replace(sd, {diffusion_model_prefix: ""}, filter_keys=True) + if len(temp_sd) > 0: + sd = temp_sd + parameters = comfy.utils.calculate_parameters(sd) unet_dtype = model_management.unet_dtype(model_params=parameters) load_device = model_management.get_torch_device() + model_config = model_detection.model_config_from_unet(sd, "") - if 'transformer_blocks.0.attn.add_q_proj.weight' in sd: #MMDIT SD3 - new_sd = model_detection.convert_diffusers_mmdit(sd, "") - if new_sd is None: - return None - model_config = model_detection.model_config_from_unet(new_sd, "") - if model_config is None: - return None - elif "input_blocks.0.0.weight" in sd or 'clf.1.weight' in sd: #ldm or stable cascade - model_config = model_detection.model_config_from_unet(sd, "") - if model_config is None: - return None + if model_config is not None: new_sd = sd - - else: #diffusers - model_config = model_detection.model_config_from_diffusers_unet(sd) - if model_config is None: - return None 
- - diffusers_keys = comfy.utils.unet_to_diffusers(model_config.unet_config) - - new_sd = {} - for k in diffusers_keys: - if k in sd: - new_sd[diffusers_keys[k]] = sd.pop(k) - else: - logging.warning("{} {}".format(diffusers_keys[k], k)) + else: + new_sd = model_detection.convert_diffusers_mmdit(sd, "") + if new_sd is not None: #diffusers mmdit + model_config = model_detection.model_config_from_unet(new_sd, "") + if model_config is None: + return None + else: #diffusers unet + model_config = model_detection.model_config_from_diffusers_unet(sd) + if model_config is None: + return None + + diffusers_keys = comfy.utils.unet_to_diffusers(model_config.unet_config) + + new_sd = {} + for k in diffusers_keys: + if k in sd: + new_sd[diffusers_keys[k]] = sd.pop(k) + else: + logging.warning("{} {}".format(diffusers_keys[k], k)) offload_device = model_management.unet_offload_device() unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes) @@ -628,4 +615,9 @@ def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, m for k in extra_keys: sd[k] = extra_keys[k] + for k in sd: + t = sd[k] + if not t.is_contiguous(): + sd[k] = t.contiguous() + comfy.utils.save_torch_file(sd, output_path, metadata=metadata) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 78e556b568f7..565ad69dade6 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -38,7 +38,9 @@ def encode_token_weights(self, token_weight_pairs): if has_weights or sections == 0: to_encode.append(gen_empty_tokens(self.special_tokens, max_token_len)) - out, pooled = self.encode(to_encode) + o = self.encode(to_encode) + out, pooled = o[:2] + if pooled is not None: first_pooled = pooled[0:1].to(model_management.intermediate_device()) else: @@ -57,8 +59,20 @@ def encode_token_weights(self, token_weight_pairs): output.append(z) if (len(output) == 0): - return out[-1:].to(model_management.intermediate_device()), first_pooled - return 
torch.cat(output, dim=-2).to(model_management.intermediate_device()), first_pooled + r = (out[-1:].to(model_management.intermediate_device()), first_pooled) + else: + r = (torch.cat(output, dim=-2).to(model_management.intermediate_device()), first_pooled) + + if len(o) > 2: + extra = {} + for k in o[2]: + v = o[2][k] + if k == "attention_mask": + v = v[:sections].flatten().unsqueeze(dim=0).to(model_management.intermediate_device()) + extra[k] = v + + r = r + (extra,) + return r class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): """Uses the CLIP transformer encoder for text (from huggingface)""" @@ -70,7 +84,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_length=77, freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=comfy.clip_model.CLIPTextModel, special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False, - return_projected_pooled=True): # clip-vit-base-patch32 + return_projected_pooled=True, return_attention_masks=False): # clip-vit-base-patch32 super().__init__() assert layer in self.LAYERS @@ -96,6 +110,7 @@ def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_le self.layer_norm_hidden_state = layer_norm_hidden_state self.return_projected_pooled = return_projected_pooled + self.return_attention_masks = return_attention_masks if layer == "hidden": assert layer_idx is not None @@ -169,7 +184,7 @@ def forward(self, tokens): tokens = torch.LongTensor(tokens).to(device) attention_mask = None - if self.enable_attention_masks: + if self.enable_attention_masks or self.zero_out_masked or self.return_attention_masks: attention_mask = torch.zeros_like(tokens) end_token = self.special_tokens.get("end", -1) for x in range(attention_mask.shape[0]): @@ -178,7 +193,11 @@ def forward(self, tokens): if tokens[x, y] == 
end_token: break - outputs = self.transformer(tokens, attention_mask, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state) + attention_mask_model = None + if self.enable_attention_masks: + attention_mask_model = attention_mask + + outputs = self.transformer(tokens, attention_mask_model, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state) self.transformer.set_input_embeddings(backup_embeds) if self.layer == "last": @@ -186,7 +205,7 @@ def forward(self, tokens): else: z = outputs[1].float() - if self.zero_out_masked and attention_mask is not None: + if self.zero_out_masked: z *= attention_mask.unsqueeze(-1).float() pooled_output = None @@ -196,6 +215,13 @@ def forward(self, tokens): elif outputs[2] is not None: pooled_output = outputs[2].float() + extra = {} + if self.return_attention_masks: + extra["attention_mask"] = attention_mask + + if len(extra) > 0: + return z, pooled_output, extra + return z, pooled_output def encode(self, tokens): @@ -360,7 +386,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No return embed_out class SDTokenizer: - def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, pad_to_max_length=True, min_length=None): + def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, pad_to_max_length=True, min_length=None, pad_token=None): if tokenizer_path is None: tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer") self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path) @@ -376,6 +402,14 @@ def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedd self.tokens_start = 0 self.start_token 
= None self.end_token = empty[0] + + if pad_token is not None: + self.pad_token = pad_token + elif pad_with_end: + self.pad_token = self.end_token + else: + self.pad_token = 0 + self.pad_with_end = pad_with_end self.pad_to_max_length = pad_to_max_length @@ -408,10 +442,6 @@ def tokenize_with_weights(self, text:str, return_word_ids=False): Word id values are unique per word and embedding, where the id 0 is reserved for non word tokens. Returned list has the dimensions NxM where M is the input size of CLIP ''' - if self.pad_with_end: - pad_token = self.end_token - else: - pad_token = 0 text = escape_important(text) parsed_weights = token_weights(text, 1.0) @@ -463,7 +493,7 @@ def tokenize_with_weights(self, text:str, return_word_ids=False): else: batch.append((self.end_token, 1.0, 0)) if self.pad_to_max_length: - batch.extend([(pad_token, 1.0, 0)] * (remaining_length)) + batch.extend([(self.pad_token, 1.0, 0)] * (remaining_length)) #start new batch batch = [] if self.start_token is not None: @@ -476,9 +506,9 @@ def tokenize_with_weights(self, text:str, return_word_ids=False): #fill last batch batch.append((self.end_token, 1.0, 0)) if self.pad_to_max_length: - batch.extend([(pad_token, 1.0, 0)] * (self.max_length - len(batch))) + batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch))) if self.min_length is not None and len(batch) < self.min_length: - batch.extend([(pad_token, 1.0, 0)] * (self.min_length - len(batch))) + batch.extend([(self.pad_token, 1.0, 0)] * (self.min_length - len(batch))) if not return_word_ids: batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens] @@ -506,10 +536,16 @@ def untokenize(self, token_weight_pair): class SD1ClipModel(torch.nn.Module): - def __init__(self, device="cpu", dtype=None, clip_name="l", clip_model=SDClipModel, **kwargs): + def __init__(self, device="cpu", dtype=None, clip_name="l", clip_model=SDClipModel, name=None, **kwargs): super().__init__() - self.clip_name = clip_name - self.clip = 
"clip_{}".format(self.clip_name) + + if name is not None: + self.clip_name = name + self.clip = "{}".format(self.clip_name) + else: + self.clip_name = clip_name + self.clip = "clip_{}".format(self.clip_name) + setattr(self, self.clip, clip_model(device=device, dtype=dtype, **kwargs)) self.dtypes = set() @@ -524,8 +560,8 @@ def reset_clip_options(self): def encode_token_weights(self, token_weight_pairs): token_weight_pairs = token_weight_pairs[self.clip_name] - out, pooled = getattr(self, self.clip).encode_token_weights(token_weight_pairs) - return out, pooled + out = getattr(self, self.clip).encode_token_weights(token_weight_pairs) + return out def load_sd(self, sd): return getattr(self, self.clip).load_sd(sd) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 761498dbc9e5..b4d1059ef3f8 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -5,8 +5,9 @@ from . import sd1_clip from . import sd2_clip from . import sdxl_clip -from . import sd3_clip -from . import sa_t5 +import comfy.text_encoders.sd3_clip +import comfy.text_encoders.sa_t5 +import comfy.text_encoders.aura_t5 from . import supported_models_base from . 
import latent_formats @@ -523,7 +524,7 @@ def clip_target(self, state_dict={}): t5 = True dtype_t5 = state_dict[t5_key].dtype - return supported_models_base.ClipTarget(sd3_clip.SD3Tokenizer, sd3_clip.sd3_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5)) + return supported_models_base.ClipTarget(comfy.text_encoders.sd3_clip.SD3Tokenizer, comfy.text_encoders.sd3_clip.sd3_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5)) class StableAudio(supported_models_base.BASE): unet_config = { @@ -543,17 +544,42 @@ def get_model(self, state_dict, prefix="", device=None): seconds_total_sd = utils.state_dict_prefix_replace(state_dict, {"conditioner.conditioners.seconds_total.": ""}, filter_keys=True) return model_base.StableAudio1(self, seconds_start_embedder_weights=seconds_start_sd, seconds_total_embedder_weights=seconds_total_sd, device=device) - def process_unet_state_dict(self, state_dict): for k in list(state_dict.keys()): if k.endswith(".cross_attend_norm.beta") or k.endswith(".ff_norm.beta") or k.endswith(".pre_norm.beta"): #These weights are all zero state_dict.pop(k) return state_dict + def process_unet_state_dict_for_saving(self, state_dict): + replace_prefix = {"": "model.model."} + return utils.state_dict_prefix_replace(state_dict, replace_prefix) + def clip_target(self, state_dict={}): - return supported_models_base.ClipTarget(sa_t5.SAT5Tokenizer, sa_t5.SAT5Model) + return supported_models_base.ClipTarget(comfy.text_encoders.sa_t5.SAT5Tokenizer, comfy.text_encoders.sa_t5.SAT5Model) + +class AuraFlow(supported_models_base.BASE): + unet_config = { + "cond_seq_dim": 2048, + } + sampling_settings = { + "multiplier": 1.0, + "shift": 1.73, + } + + unet_extra_config = {} + latent_format = latent_formats.SDXL + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.AuraFlow(self, device=device) + return out + + def clip_target(self, state_dict={}): + 
return supported_models_base.ClipTarget(comfy.text_encoders.aura_t5.AuraT5Tokenizer, comfy.text_encoders.aura_t5.AuraT5Model) -models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio] +models = [Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow] models += [SVD_img2vid] diff --git a/comfy/t2i_adapter/adapter.py b/comfy/t2i_adapter/adapter.py index e9a606b1cd67..10ea18e32669 100644 --- a/comfy/t2i_adapter/adapter.py +++ b/comfy/t2i_adapter/adapter.py @@ -153,7 +153,13 @@ def forward(self, x): features.append(None) features.append(x) - return features + features = features[::-1] + + if self.xl: + return {"input": features[1:], "middle": features[:1]} + else: + return {"input": features} + class LayerNorm(nn.LayerNorm): @@ -290,4 +296,4 @@ def forward(self, x): features.append(None) features.append(x) - return features + return {"input": features[::-1]} diff --git a/comfy/text_encoders/aura_t5.py b/comfy/text_encoders/aura_t5.py new file mode 100644 index 000000000000..6b9e4fe537ca --- /dev/null +++ b/comfy/text_encoders/aura_t5.py @@ -0,0 +1,22 @@ +from comfy import sd1_clip +from .llama_tokenizer import LLAMATokenizer +import comfy.text_encoders.t5 +import os + +class PT5XlModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None): + textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_config_xl.json") + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 2, "pad": 1}, 
model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True) + +class PT5XlTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None): + tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_tokenizer"), "tokenizer.model") + super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=LLAMATokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1) + +class AuraT5Tokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None): + super().__init__(embedding_directory=embedding_directory, clip_name="pile_t5xl", tokenizer=PT5XlTokenizer) + +class AuraT5Model(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, **kwargs): + super().__init__(device=device, dtype=dtype, name="pile_t5xl", clip_model=PT5XlModel, **kwargs) diff --git a/comfy/text_encoders/llama_tokenizer.py b/comfy/text_encoders/llama_tokenizer.py new file mode 100644 index 000000000000..a6db1da629cf --- /dev/null +++ b/comfy/text_encoders/llama_tokenizer.py @@ -0,0 +1,22 @@ +import os + +class LLAMATokenizer: + @staticmethod + def from_pretrained(path): + return LLAMATokenizer(path) + + def __init__(self, tokenizer_path): + import sentencepiece + self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path) + self.end = self.tokenizer.eos_id() + + def get_vocab(self): + out = {} + for i in range(self.tokenizer.get_piece_size()): + out[self.tokenizer.id_to_piece(i)] = i + return out + + def __call__(self, string): + out = self.tokenizer.encode(string) + out += [self.end] + return {"input_ids": out} diff --git a/comfy/sa_t5.py b/comfy/text_encoders/sa_t5.py similarity index 81% rename from comfy/sa_t5.py rename to comfy/text_encoders/sa_t5.py index 37be5287e22d..038558e7aa5a 100644 --- a/comfy/sa_t5.py +++ b/comfy/text_encoders/sa_t5.py @@ -1,12 +1,12 @@ from 
comfy import sd1_clip from transformers import T5TokenizerFast -import comfy.t5 +import comfy.text_encoders.t5 import os class T5BaseModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None): textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_base.json") - super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.t5.T5, enable_attention_masks=True, zero_out_masked=True) + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True) class T5BaseTokenizer(sd1_clip.SDTokenizer): def __init__(self, embedding_directory=None): @@ -19,4 +19,4 @@ def __init__(self, embedding_directory=None): class SAT5Model(sd1_clip.SD1ClipModel): def __init__(self, device="cpu", dtype=None, **kwargs): - super().__init__(device=device, dtype=dtype, clip_name="t5base", clip_model=T5BaseModel, **kwargs) + super().__init__(device=device, dtype=dtype, name="t5base", clip_model=T5BaseModel, **kwargs) diff --git a/comfy/sd3_clip.py b/comfy/text_encoders/sd3_clip.py similarity index 98% rename from comfy/sd3_clip.py rename to comfy/text_encoders/sd3_clip.py index 0713eb285294..70127e50975e 100644 --- a/comfy/sd3_clip.py +++ b/comfy/text_encoders/sd3_clip.py @@ -1,7 +1,7 @@ from comfy import sd1_clip from comfy import sdxl_clip from transformers import T5TokenizerFast -import comfy.t5 +import comfy.text_encoders.t5 import torch import os import comfy.model_management @@ -10,7 +10,7 @@ class T5XXLModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None): textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json") - 
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.t5.T5) + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5) class T5XXLTokenizer(sd1_clip.SDTokenizer): def __init__(self, embedding_directory=None): diff --git a/comfy/t5.py b/comfy/text_encoders/t5.py similarity index 90% rename from comfy/t5.py rename to comfy/text_encoders/t5.py index 06dfe47668e6..448c5aad3e0d 100644 --- a/comfy/t5.py +++ b/comfy/text_encoders/t5.py @@ -13,29 +13,36 @@ def forward(self, x): x = x * torch.rsqrt(variance + self.variance_epsilon) return self.weight.to(device=x.device, dtype=x.dtype) * x +activations = { + "gelu_pytorch_tanh": lambda a: torch.nn.functional.gelu(a, approximate="tanh"), + "relu": torch.nn.functional.relu, +} + class T5DenseActDense(torch.nn.Module): - def __init__(self, model_dim, ff_dim, dtype, device, operations): + def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations): super().__init__() self.wi = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device) # self.dropout = nn.Dropout(config.dropout_rate) + self.act = activations[ff_activation] def forward(self, x): - x = torch.nn.functional.relu(self.wi(x)) + x = self.act(self.wi(x)) # x = self.dropout(x) x = self.wo(x) return x class T5DenseGatedActDense(torch.nn.Module): - def __init__(self, model_dim, ff_dim, dtype, device, operations): + def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations): super().__init__() self.wi_0 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) self.wi_1 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device) 
self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device) # self.dropout = nn.Dropout(config.dropout_rate) + self.act = activations[ff_activation] def forward(self, x): - hidden_gelu = torch.nn.functional.gelu(self.wi_0(x), approximate="tanh") + hidden_gelu = self.act(self.wi_0(x)) hidden_linear = self.wi_1(x) x = hidden_gelu * hidden_linear # x = self.dropout(x) @@ -43,12 +50,12 @@ def forward(self, x): return x class T5LayerFF(torch.nn.Module): - def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations): + def __init__(self, model_dim, ff_dim, ff_activation, gated_act, dtype, device, operations): super().__init__() - if ff_activation == "gelu_pytorch_tanh": - self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, dtype, device, operations) - elif ff_activation == "relu": - self.DenseReluDense = T5DenseActDense(model_dim, ff_dim, dtype, device, operations) + if gated_act: + self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, ff_activation, dtype, device, operations) + else: + self.DenseReluDense = T5DenseActDense(model_dim, ff_dim, ff_activation, dtype, device, operations) self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations) # self.dropout = nn.Dropout(config.dropout_rate) @@ -171,11 +178,11 @@ def forward(self, x, mask=None, past_bias=None, optimized_attention=None): return x, past_bias class T5Block(torch.nn.Module): - def __init__(self, model_dim, inner_dim, ff_dim, ff_activation, num_heads, relative_attention_bias, dtype, device, operations): + def __init__(self, model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention_bias, dtype, device, operations): super().__init__() self.layer = torch.nn.ModuleList() self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations)) - self.layer.append(T5LayerFF(model_dim, ff_dim, ff_activation, dtype, device, operations)) 
+ self.layer.append(T5LayerFF(model_dim, ff_dim, ff_activation, gated_act, dtype, device, operations)) def forward(self, x, mask=None, past_bias=None, optimized_attention=None): x, past_bias = self.layer[0](x, mask, past_bias, optimized_attention) @@ -183,11 +190,11 @@ def forward(self, x, mask=None, past_bias=None, optimized_attention=None): return x, past_bias class T5Stack(torch.nn.Module): - def __init__(self, num_layers, model_dim, inner_dim, ff_dim, ff_activation, num_heads, dtype, device, operations): + def __init__(self, num_layers, model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention, dtype, device, operations): super().__init__() self.block = torch.nn.ModuleList( - [T5Block(model_dim, inner_dim, ff_dim, ff_activation, num_heads, relative_attention_bias=(i == 0), dtype=dtype, device=device, operations=operations) for i in range(num_layers)] + [T5Block(model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention_bias=((not relative_attention) or (i == 0)), dtype=dtype, device=device, operations=operations) for i in range(num_layers)] ) self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations) # self.dropout = nn.Dropout(config.dropout_rate) @@ -216,7 +223,7 @@ def __init__(self, config_dict, dtype, device, operations): self.num_layers = config_dict["num_layers"] model_dim = config_dict["d_model"] - self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["num_heads"], dtype, device, operations) + self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["is_gated_act"], config_dict["num_heads"], config_dict["model_type"] == "t5", dtype, device, operations) self.dtype = dtype self.shared = torch.nn.Embedding(config_dict["vocab_size"], model_dim, device=device) diff --git a/comfy/t5_config_base.json 
b/comfy/text_encoders/t5_config_base.json similarity index 94% rename from comfy/t5_config_base.json rename to comfy/text_encoders/t5_config_base.json index facd85ef3a9c..71f68327c272 100644 --- a/comfy/t5_config_base.json +++ b/comfy/text_encoders/t5_config_base.json @@ -8,6 +8,7 @@ "dense_act_fn": "relu", "initializer_factor": 1.0, "is_encoder_decoder": true, + "is_gated_act": false, "layer_norm_epsilon": 1e-06, "model_type": "t5", "num_decoder_layers": 12, diff --git a/comfy/t5_config_xxl.json b/comfy/text_encoders/t5_config_xxl.json similarity index 95% rename from comfy/t5_config_xxl.json rename to comfy/text_encoders/t5_config_xxl.json index bf4feadcf501..28283b51a11b 100644 --- a/comfy/t5_config_xxl.json +++ b/comfy/text_encoders/t5_config_xxl.json @@ -8,6 +8,7 @@ "dense_act_fn": "gelu_pytorch_tanh", "initializer_factor": 1.0, "is_encoder_decoder": true, + "is_gated_act": true, "layer_norm_epsilon": 1e-06, "model_type": "t5", "num_decoder_layers": 24, diff --git a/comfy/text_encoders/t5_pile_config_xl.json b/comfy/text_encoders/t5_pile_config_xl.json new file mode 100644 index 000000000000..ee4e03f97a5b --- /dev/null +++ b/comfy/text_encoders/t5_pile_config_xl.json @@ -0,0 +1,22 @@ +{ + "d_ff": 5120, + "d_kv": 64, + "d_model": 2048, + "decoder_start_token_id": 0, + "dropout_rate": 0.1, + "eos_token_id": 2, + "dense_act_fn": "gelu_pytorch_tanh", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "umt5", + "num_decoder_layers": 24, + "num_heads": 32, + "num_layers": 24, + "output_past": true, + "pad_token_id": 1, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "vocab_size": 32128 +} diff --git a/comfy/text_encoders/t5_pile_tokenizer/tokenizer.model b/comfy/text_encoders/t5_pile_tokenizer/tokenizer.model new file mode 100644 index 000000000000..22bccbcb41ec Binary files /dev/null and b/comfy/text_encoders/t5_pile_tokenizer/tokenizer.model differ diff --git 
a/comfy/t5_tokenizer/special_tokens_map.json b/comfy/text_encoders/t5_tokenizer/special_tokens_map.json similarity index 100% rename from comfy/t5_tokenizer/special_tokens_map.json rename to comfy/text_encoders/t5_tokenizer/special_tokens_map.json diff --git a/comfy/t5_tokenizer/tokenizer.json b/comfy/text_encoders/t5_tokenizer/tokenizer.json similarity index 100% rename from comfy/t5_tokenizer/tokenizer.json rename to comfy/text_encoders/t5_tokenizer/tokenizer.json diff --git a/comfy/t5_tokenizer/tokenizer_config.json b/comfy/text_encoders/t5_tokenizer/tokenizer_config.json similarity index 100% rename from comfy/t5_tokenizer/tokenizer_config.json rename to comfy/text_encoders/t5_tokenizer/tokenizer_config.json diff --git a/comfy/utils.py b/comfy/utils.py index ed6c58a64e7e..1e4b5ef882d2 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -298,7 +298,8 @@ def mmdit_to_diffusers(mmdit_config, output_prefix=""): key_map = {} depth = mmdit_config.get("depth", 0) - for i in range(depth): + num_blocks = mmdit_config.get("num_blocks", depth) + for i in range(num_blocks): block_from = "transformer_blocks.{}".format(i) block_to = "{}joint_blocks.{}".format(output_prefix, i) @@ -331,6 +332,76 @@ def mmdit_to_diffusers(mmdit_config, output_prefix=""): return key_map + +def auraflow_to_diffusers(mmdit_config, output_prefix=""): + n_double_layers = mmdit_config.get("n_double_layers", 0) + n_layers = mmdit_config.get("n_layers", 0) + + key_map = {} + for i in range(n_layers): + if i < n_double_layers: + index = i + prefix_from = "joint_transformer_blocks" + prefix_to = "{}double_layers".format(output_prefix) + block_map = { + "attn.to_q.weight": "attn.w2q.weight", + "attn.to_k.weight": "attn.w2k.weight", + "attn.to_v.weight": "attn.w2v.weight", + "attn.to_out.0.weight": "attn.w2o.weight", + "attn.add_q_proj.weight": "attn.w1q.weight", + "attn.add_k_proj.weight": "attn.w1k.weight", + "attn.add_v_proj.weight": "attn.w1v.weight", + "attn.to_add_out.weight": "attn.w1o.weight", + 
"ff.linear_1.weight": "mlpX.c_fc1.weight", + "ff.linear_2.weight": "mlpX.c_fc2.weight", + "ff.out_projection.weight": "mlpX.c_proj.weight", + "ff_context.linear_1.weight": "mlpC.c_fc1.weight", + "ff_context.linear_2.weight": "mlpC.c_fc2.weight", + "ff_context.out_projection.weight": "mlpC.c_proj.weight", + "norm1.linear.weight": "modX.1.weight", + "norm1_context.linear.weight": "modC.1.weight", + } + else: + index = i - n_double_layers + prefix_from = "single_transformer_blocks" + prefix_to = "{}single_layers".format(output_prefix) + + block_map = { + "attn.to_q.weight": "attn.w1q.weight", + "attn.to_k.weight": "attn.w1k.weight", + "attn.to_v.weight": "attn.w1v.weight", + "attn.to_out.0.weight": "attn.w1o.weight", + "norm1.linear.weight": "modCX.1.weight", + "ff.linear_1.weight": "mlp.c_fc1.weight", + "ff.linear_2.weight": "mlp.c_fc2.weight", + "ff.out_projection.weight": "mlp.c_proj.weight" + } + + for k in block_map: + key_map["{}.{}.{}".format(prefix_from, index, k)] = "{}.{}.{}".format(prefix_to, index, block_map[k]) + + MAP_BASIC = { + ("positional_encoding", "pos_embed.pos_embed"), + ("register_tokens", "register_tokens"), + ("t_embedder.mlp.0.weight", "time_step_proj.linear_1.weight"), + ("t_embedder.mlp.0.bias", "time_step_proj.linear_1.bias"), + ("t_embedder.mlp.2.weight", "time_step_proj.linear_2.weight"), + ("t_embedder.mlp.2.bias", "time_step_proj.linear_2.bias"), + ("cond_seq_linear.weight", "context_embedder.weight"), + ("init_x_linear.weight", "pos_embed.proj.weight"), + ("init_x_linear.bias", "pos_embed.proj.bias"), + ("final_linear.weight", "proj_out.weight"), + ("modF.1.weight", "norm_out.linear.weight", swap_scale_shift), + } + + for k in MAP_BASIC: + if len(k) > 2: + key_map[k[1]] = ("{}{}".format(output_prefix, k[0]), None, k[2]) + else: + key_map[k[1]] = "{}{}".format(output_prefix, k[0]) + + return key_map + def repeat_to_batch_size(tensor, batch_size, dim=0): if tensor.shape[dim] > batch_size: return tensor.narrow(dim, 0, batch_size) diff 
--git a/comfy_extras/nodes_advanced_samplers.py b/comfy_extras/nodes_advanced_samplers.py index cee3a10c46ee..820c250ef3aa 100644 --- a/comfy_extras/nodes_advanced_samplers.py +++ b/comfy_extras/nodes_advanced_samplers.py @@ -60,7 +60,7 @@ def get_sampler(self, scale_ratio, scale_steps, upscale_method): import comfy.model_patcher @torch.no_grad() -def sample_euler_cfgpp(model, x, sigmas, extra_args=None, callback=None, disable=None): +def sample_euler_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): extra_args = {} if extra_args is None else extra_args temp = [0] @@ -75,11 +75,11 @@ def post_cfg_function(args): for i in trange(len(sigmas) - 1, disable=disable): sigma_hat = sigmas[i] denoised = model(x, sigma_hat * s_in, **extra_args) - d = to_d(x, sigma_hat, temp[0]) + d = to_d(x - denoised + temp[0], sigmas[i], denoised) if callback is not None: callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised}) dt = sigmas[i + 1] - sigma_hat - x = denoised + sigmas[i + 1] * d + x = x + d * dt return x @@ -96,10 +96,10 @@ def INPUT_TYPES(s): FUNCTION = "get_sampler" def get_sampler(self, version): - if version == "regular": - sampler = comfy.samplers.KSAMPLER(sample_euler_cfgpp) + if version == "alternative": + sampler = comfy.samplers.KSAMPLER(sample_euler_pp) else: - sampler = comfy.samplers.ksampler("euler_pp") + sampler = comfy.samplers.ksampler("euler_cfg_pp") return (sampler, ) NODE_CLASS_MAPPINGS = { diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 34bcfa96d45c..6f0e26365304 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -6,6 +6,7 @@ import io import json import struct +import random from comfy.cli_args import args class EmptyLatentAudio: @@ -14,15 +15,16 @@ def __init__(self): @classmethod def INPUT_TYPES(s): - return {"required": {}} + return {"required": {"seconds": ("FLOAT", {"default": 47.6, "min": 1.0, "max": 1000.0, "step": 0.1})}} RETURN_TYPES = 
("LATENT",) FUNCTION = "generate" - CATEGORY = "_for_testing/audio" + CATEGORY = "latent/audio" - def generate(self): + def generate(self, seconds): batch_size = 1 - latent = torch.zeros([batch_size, 64, 1024], device=self.device) + length = round((seconds * 44100 / 2048) / 2) * 2 + latent = torch.zeros([batch_size, 64, length], device=self.device) return ({"samples":latent, "type": "audio"}, ) class VAEEncodeAudio: @@ -32,7 +34,7 @@ def INPUT_TYPES(s): RETURN_TYPES = ("LATENT",) FUNCTION = "encode" - CATEGORY = "_for_testing/audio" + CATEGORY = "latent/audio" def encode(self, vae, audio): sample_rate = audio["sample_rate"] @@ -51,7 +53,7 @@ def INPUT_TYPES(s): RETURN_TYPES = ("AUDIO",) FUNCTION = "decode" - CATEGORY = "_for_testing/audio" + CATEGORY = "latent/audio" def decode(self, vae, samples): audio = vae.decode(samples["samples"]).movedim(-1, 1) @@ -117,7 +119,6 @@ def __init__(self): self.output_dir = folder_paths.get_output_directory() self.type = "output" self.prefix_append = "" - self.compress_level = 4 @classmethod def INPUT_TYPES(s): @@ -131,7 +132,7 @@ def INPUT_TYPES(s): OUTPUT_NODE = True - CATEGORY = "_for_testing/audio" + CATEGORY = "audio" def save_audio(self, audio, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None): filename_prefix += self.prefix_append @@ -146,7 +147,7 @@ def save_audio(self, audio, filename_prefix="ComfyUI", prompt=None, extra_pnginf for x in extra_pnginfo: metadata[x] = json.dumps(extra_pnginfo[x]) - for (batch_number, waveform) in enumerate(audio["waveform"]): + for (batch_number, waveform) in enumerate(audio["waveform"].cpu()): filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) file = f"{filename_with_batch_num}_{counter:05}_.flac" @@ -167,6 +168,19 @@ def save_audio(self, audio, filename_prefix="ComfyUI", prompt=None, extra_pnginf return { "ui": { "audio": results } } +class PreviewAudio(SaveAudio): + def __init__(self): + self.output_dir = folder_paths.get_temp_directory() + self.type = 
"temp" + self.prefix_append = "_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for x in range(5)) + + @classmethod + def INPUT_TYPES(s): + return {"required": + {"audio": ("AUDIO", ), }, + "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, + } + class LoadAudio: SUPPORTED_FORMATS = ('.wav', '.mp3', '.ogg', '.flac', '.aiff', '.aif') @@ -181,7 +195,7 @@ def INPUT_TYPES(s): ] return {"required": {"audio": (sorted(files), {"audio_upload": True})}} - CATEGORY = "_for_testing/audio" + CATEGORY = "audio" RETURN_TYPES = ("AUDIO", ) FUNCTION = "load" @@ -189,7 +203,6 @@ def INPUT_TYPES(s): def load(self, audio): audio_path = folder_paths.get_annotated_filepath(audio) waveform, sample_rate = torchaudio.load(audio_path) - multiplier = 1.0 audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} return (audio, ) @@ -213,4 +226,5 @@ def VALIDATE_INPUTS(s, audio): "VAEDecodeAudio": VAEDecodeAudio, "SaveAudio": SaveAudio, "LoadAudio": LoadAudio, + "PreviewAudio": PreviewAudio, } diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py new file mode 100644 index 000000000000..ef7cfc6ab49c --- /dev/null +++ b/comfy_extras/nodes_controlnet.py @@ -0,0 +1,37 @@ + +UNION_CONTROLNET_TYPES = {"auto": -1, + "openpose": 0, + "depth": 1, + "hed/pidi/scribble/ted": 2, + "canny/lineart/anime_lineart/mlsd": 3, + "normal": 4, + "segment": 5, + "tile": 6, + "repaint": 7, + } + +class SetUnionControlNetType: + @classmethod + def INPUT_TYPES(s): + return {"required": {"control_net": ("CONTROL_NET", ), + "type": (list(UNION_CONTROLNET_TYPES.keys()),) + }} + + CATEGORY = "conditioning/controlnet" + RETURN_TYPES = ("CONTROL_NET",) + + FUNCTION = "set_controlnet_type" + + def set_controlnet_type(self, control_net, type): + control_net = control_net.copy() + type_number = UNION_CONTROLNET_TYPES[type] + if type_number >= 0: + control_net.set_extra_arg("control_type", [type_number]) + else: + control_net.set_extra_arg("control_type", []) 
+ + return (control_net,) + +NODE_CLASS_MAPPINGS = { + "SetUnionControlNetType": SetUnionControlNetType, +} diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 69f1b94181ac..b7ab88c27cc5 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -111,6 +111,25 @@ def get_sigmas(self, model, steps, denoise): sigmas = torch.cat([sigmas, sigmas.new_zeros([1])]) return (sigmas, ) +class BetaSamplingScheduler: + @classmethod + def INPUT_TYPES(s): + return {"required": + {"model": ("MODEL",), + "steps": ("INT", {"default": 20, "min": 1, "max": 10000}), + "alpha": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 50.0, "step":0.01, "round": False}), + "beta": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 50.0, "step":0.01, "round": False}), + } + } + RETURN_TYPES = ("SIGMAS",) + CATEGORY = "sampling/custom_sampling/schedulers" + + FUNCTION = "get_sigmas" + + def get_sigmas(self, model, steps, alpha, beta): + sigmas = comfy.samplers.beta_scheduler(model.get_model_object("model_sampling"), steps, alpha=alpha, beta=beta) + return (sigmas, ) + class VPScheduler: @classmethod def INPUT_TYPES(s): @@ -293,6 +312,25 @@ def get_sampler(self, eta, s_noise): sampler = comfy.samplers.ksampler("euler_ancestral", {"eta": eta, "s_noise": s_noise}) return (sampler, ) +class SamplerEulerAncestralCFGPP: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step":0.01, "round": False}), + "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step":0.01, "round": False}), + }} + RETURN_TYPES = ("SAMPLER",) + CATEGORY = "sampling/custom_sampling/samplers" + + FUNCTION = "get_sampler" + + def get_sampler(self, eta, s_noise): + sampler = comfy.samplers.ksampler( + "euler_ancestral_cfg_pp", + {"eta": eta, "s_noise": s_noise}) + return (sampler, ) + class SamplerLMS: @classmethod def INPUT_TYPES(s): @@ -619,9 +657,11 @@ def add_noise(self, 
model, noise, sigmas, latent_image): "ExponentialScheduler": ExponentialScheduler, "PolyexponentialScheduler": PolyexponentialScheduler, "VPScheduler": VPScheduler, + "BetaSamplingScheduler": BetaSamplingScheduler, "SDTurboScheduler": SDTurboScheduler, "KSamplerSelect": KSamplerSelect, "SamplerEulerAncestral": SamplerEulerAncestral, + "SamplerEulerAncestralCFGPP": SamplerEulerAncestralCFGPP, "SamplerLMS": SamplerLMS, "SamplerDPMPP_3M_SDE": SamplerDPMPP_3M_SDE, "SamplerDPMPP_2M_SDE": SamplerDPMPP_2M_SDE, @@ -639,3 +679,7 @@ def add_noise(self, model, noise, sigmas, latent_image): "AddNoise": AddNoise, "SamplerCustomAdvanced": SamplerCustomAdvanced, } + +NODE_DISPLAY_NAME_MAPPINGS = { + "SamplerEulerAncestralCFGPP": "SamplerEulerAncestralCFG++", +} \ No newline at end of file diff --git a/comfy_extras/nodes_freelunch.py b/comfy_extras/nodes_freelunch.py index c5ebcf26fd63..e3ac58447b29 100644 --- a/comfy_extras/nodes_freelunch.py +++ b/comfy_extras/nodes_freelunch.py @@ -34,7 +34,7 @@ def INPUT_TYPES(s): RETURN_TYPES = ("MODEL",) FUNCTION = "patch" - CATEGORY = "model_patches" + CATEGORY = "model_patches/unet" def patch(self, model, b1, b2, s1, s2): model_channels = model.model.model_config.unet_config["model_channels"] @@ -73,7 +73,7 @@ def INPUT_TYPES(s): RETURN_TYPES = ("MODEL",) FUNCTION = "patch" - CATEGORY = "model_patches" + CATEGORY = "model_patches/unet" def patch(self, model, b1, b2, s1, s2): model_channels = model.model.model_config.unet_config["model_channels"] diff --git a/comfy_extras/nodes_hypertile.py b/comfy_extras/nodes_hypertile.py index ae55d23dd06d..227133f3978e 100644 --- a/comfy_extras/nodes_hypertile.py +++ b/comfy_extras/nodes_hypertile.py @@ -32,7 +32,7 @@ def INPUT_TYPES(s): RETURN_TYPES = ("MODEL",) FUNCTION = "patch" - CATEGORY = "model_patches" + CATEGORY = "model_patches/unet" def patch(self, model, tile_size, swap_size, max_depth, scale_depth): model_channels = model.model.model_config.unet_config["model_channels"] diff --git 
a/comfy_extras/nodes_model_advanced.py b/comfy_extras/nodes_model_advanced.py index 97559cf56e35..22ba9547b897 100644 --- a/comfy_extras/nodes_model_advanced.py +++ b/comfy_extras/nodes_model_advanced.py @@ -144,7 +144,7 @@ def INPUT_TYPES(s): CATEGORY = "advanced/model" - def patch(self, model, shift): + def patch(self, model, shift, multiplier=1000): m = model.clone() sampling_base = comfy.model_sampling.ModelSamplingDiscreteFlow @@ -154,10 +154,22 @@ class ModelSamplingAdvanced(sampling_base, sampling_type): pass model_sampling = ModelSamplingAdvanced(model.model.model_config) - model_sampling.set_parameters(shift=shift) + model_sampling.set_parameters(shift=shift, multiplier=multiplier) m.add_object_patch("model_sampling", model_sampling) return (m, ) +class ModelSamplingAuraFlow(ModelSamplingSD3): + @classmethod + def INPUT_TYPES(s): + return {"required": { "model": ("MODEL",), + "shift": ("FLOAT", {"default": 1.73, "min": 0.0, "max": 100.0, "step":0.01}), + }} + + FUNCTION = "patch_aura" + + def patch_aura(self, model, shift): + return self.patch(model, shift, multiplier=1.0) + class ModelSamplingContinuousEDM: @classmethod def INPUT_TYPES(s): @@ -271,5 +283,6 @@ def rescale_cfg(args): "ModelSamplingContinuousV": ModelSamplingContinuousV, "ModelSamplingStableCascade": ModelSamplingStableCascade, "ModelSamplingSD3": ModelSamplingSD3, + "ModelSamplingAuraFlow": ModelSamplingAuraFlow, "RescaleCFG": RescaleCFG, } diff --git a/comfy_extras/nodes_pag.py b/comfy_extras/nodes_pag.py index 63f43fd626b1..aec78bd8a9da 100644 --- a/comfy_extras/nodes_pag.py +++ b/comfy_extras/nodes_pag.py @@ -19,7 +19,7 @@ def INPUT_TYPES(s): RETURN_TYPES = ("MODEL",) FUNCTION = "patch" - CATEGORY = "_for_testing" + CATEGORY = "model_patches/unet" def patch(self, model, scale): unet_block = "middle" diff --git a/comfy_extras/nodes_sd3.py b/comfy_extras/nodes_sd3.py index d0303aec58fe..0aafa2426097 100644 --- a/comfy_extras/nodes_sd3.py +++ b/comfy_extras/nodes_sd3.py @@ -80,8 +80,23 @@ 
def encode(self, clip, clip_l, clip_g, t5xxl, empty_padding): return ([[cond, {"pooled_output": pooled}]], ) +class ControlNetApplySD3(nodes.ControlNetApplyAdvanced): + @classmethod + def INPUT_TYPES(s): + return {"required": {"positive": ("CONDITIONING", ), + "negative": ("CONDITIONING", ), + "control_net": ("CONTROL_NET", ), + "vae": ("VAE", ), + "image": ("IMAGE", ), + "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), + "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), + "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}) + }} + CATEGORY = "conditioning/controlnet" + NODE_CLASS_MAPPINGS = { "TripleCLIPLoader": TripleCLIPLoader, "EmptySD3LatentImage": EmptySD3LatentImage, "CLIPTextEncodeSD3": CLIPTextEncodeSD3, + "ControlNetApplySD3": ControlNetApplySD3, } diff --git a/custom_nodes/example_node.py.example b/custom_nodes/example_node.py.example index f066325930d9..72ca3688c65f 100644 --- a/custom_nodes/example_node.py.example +++ b/custom_nodes/example_node.py.example @@ -12,9 +12,9 @@ class Example: Attributes ---------- RETURN_TYPES (`tuple`): - The type of each element in the output tulple. + The type of each element in the output tuple. RETURN_NAMES (`tuple`): - Optional: The name of each output in the output tulple. + Optional: The name of each output in the output tuple. FUNCTION (`str`): The name of the entry-point method. For example, if `FUNCTION = "execute"` then it will run Example().execute() OUTPUT_NODE ([`bool`]): @@ -44,7 +44,7 @@ class Example: * Key field_name (`string`): Name of a entry-point method's argument * Value field_config (`tuple`): + First value is a string indicate the type of field or a list for selection. - + Secound value is a config for type "INT", "STRING" or "FLOAT". + + Second value is a config for type "INT", "STRING" or "FLOAT". 
""" return { "required": { @@ -61,7 +61,7 @@ class Example: "min": 0.0, "max": 10.0, "step": 0.01, - "round": 0.001, #The value represeting the precision to round to, will be set to the step value by default. Can be set to False to disable rounding. + "round": 0.001, #The value representing the precision to round to, will be set to the step value by default. Can be set to False to disable rounding. "display": "number"}), "print_to_screen": (["enable", "disable"],), "string_field": ("STRING", { @@ -106,6 +106,16 @@ class Example: # Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension # WEB_DIRECTORY = "./somejs" + +# Add custom API routes, using router +from aiohttp import web +from server import PromptServer + +@PromptServer.instance.routes.get("/hello") +async def get_hello(request): + return web.json_response("hello") + + # A dictionary that contains all nodes you want to export with their names # NOTE: names should be globally unique NODE_CLASS_MAPPINGS = { diff --git a/custom_nodes/latent_safety_filter.py b/custom_nodes/latent_safety_filter.py new file mode 100644 index 000000000000..26b05028e22e --- /dev/null +++ b/custom_nodes/latent_safety_filter.py @@ -0,0 +1,173 @@ +from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer +from sklearn.metrics.pairwise import cosine_similarity +import torch +import open_clip + +class LatentSafetyFilter: + """ + A example node + + Class methods + ------------- + INPUT_TYPES (dict): + Tell the main program input parameters of nodes. + IS_CHANGED: + optional method to control when the node is re executed. + + Attributes + ---------- + RETURN_TYPES (`tuple`): + The type of each element in the output tulple. + RETURN_NAMES (`tuple`): + Optional: The name of each output in the output tulple. + FUNCTION (`str`): + The name of the entry-point method. 
For example, if `FUNCTION = "execute"` then it will run Example().execute() + OUTPUT_NODE ([`bool`]): + If this node is an output node that outputs a result/image from the graph. The SaveImage node is an example. + The backend iterates on these output nodes and tries to execute all their parents if their parent graph is properly connected. + Assumed to be False if not present. + CATEGORY (`str`): + The category the node should appear in the UI. + execute(s) -> tuple || None: + The entry point method. The name of this method must be the same as the value of property `FUNCTION`. + For example, if `FUNCTION = "execute"` then this method's name must be `execute`, if `FUNCTION = "foo"` then it must be `foo`. + """ + def __init__(self): + pass + + @classmethod + def INPUT_TYPES(s): + """ + Return a dictionary which contains config for all input fields. + Some types (string): "MODEL", "VAE", "CLIP", "CONDITIONING", "LATENT", "IMAGE", "INT", "STRING", "FLOAT". + Input types "INT", "STRING" or "FLOAT" are special values for fields on the node. + The type can be a list for selection. + + Returns: `dict`: + - Key input_fields_group (`string`): Can be either required, hidden or optional. A node class must have property `required` + - Value input_fields (`dict`): Contains input fields config: + * Key field_name (`string`): Name of a entry-point method's argument + * Value field_config (`tuple`): + + First value is a string indicate the type of field or a list for selection. + + Secound value is a config for type "INT", "STRING" or "FLOAT". + """ + return { + "required": { + "samples": ("LATENT", ), + "safety_filter": ("STRING", { + "multiline": False, #True if you want the field to look like the one on the ClipTextEncode node + "default": "nsfw" + }), + "threshold": ("FLOAT", { + "default": 0.2, + "min": 0.0, + "max": 1.0, + "step": 0.01, + "round": 0.001, #The value represeting the precision to round to, will be set to the step value by default. 
Can be set to False to disable rounding. + "display": "number"}), + "int_field": ("INT", { + "default": 0, + "min": 0, #Minimum value + "max": 4096, #Maximum value + "step": 64, #Slider's step + "display": "number" # Cosmetic only: display as "number" or "slider" + }), + "print_to_screen": (["enable", "disable"],), + }, + } + + RETURN_TYPES = ("LATENT",) + #RETURN_NAMES = ("image_output_name",) + + FUNCTION = "test" + + #OUTPUT_NODE = False + + CATEGORY = "Safety" + + def get_model_info(self, model_ID, device): + model = CLIPModel.from_pretrained(model_ID).to(device) + processor = CLIPProcessor.from_pretrained(model_ID) + tokenizer = CLIPTokenizer.from_pretrained(model_ID) + return model, processor, tokenizer + + def test(self, samples, safety_filter, int_field, threshold, print_to_screen): + models = {'B-8': {'model_name':'Latent-ViT-B-8-512', + 'pretrained':'/dlabdata1/wendler/models/latent-clip-b-8.pt'}, + 'B-4-plus':{'model_name':'Latent-ViT-B-4-512-plus', + 'pretrained':'/dlabdata1/wendler/models/latent-clip-b-4-plus.pt'}} + size = 'B-4-plus' + model_name = models[size]['model_name'] + pretrained = models[size]['pretrained'] + model_latent, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained) + tokenizer_latent = open_clip.get_tokenizer(model_name) + + image_features = model_latent.encode_image(samples["samples"]) + text_features = model_latent.encode_text(tokenizer_latent([f"an image of {safety_filter}", f"an image of no {safety_filter}"])) + + image_features /= image_features.norm(dim=-1, keepdim=True) + text_features /= text_features.norm(dim=-1, keepdim=True) + + text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1) + print(text_probs) + + for i, sample in enumerate(samples["samples"]): + + + if text_probs[i][0].item() > threshold: + samples["samples"][i].zero_() + print("Sample", i, "processed: Set to zero") + else: + print("Sample", i, "processed: Not set to zero") + + print("Probability:", 
text_probs[0][0].item()) + print("Threshold:", threshold) + #print("Safety Filter:", safety_filters[i]) + + return (samples,) + if text_probs[0][0].item() > threshold: + samples["samples"].zero_() + print("THIS") + else: + print("NOT THIS") + print(text_probs[0][0].item()) + print(threshold) + print(safety_filter) + return (samples, ) + #text_features = model_latent.encode_text(captions.cuda()) + print(image_features.shape) + image_features_np = image_features.detach().numpy() + text_features_np = text_features.detach().numpy() + + similarity_score = cosine_similarity(image_features_np, text_features_np) + print(f"Similarity ({text}):\t{similarity_score}") + + + + + """ + The node will always be re executed if any of the inputs change but + this method can be used to force the node to execute again even when the inputs don't change. + You can make this node return a number or a string. This value will be compared to the one returned the last time the node was + executed, if it is different the node will be executed again. + This method is used in the core repo for the LoadImage node where they return the image hash as a string, if the image hash + changes between executions the LoadImage node is executed again. 
+ """ + #@classmethod + #def IS_CHANGED(s, image, string_field, int_field, float_field, print_to_screen): + # return "" + +# Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension +# WEB_DIRECTORY = "./somejs" + +# A dictionary that contains all nodes you want to export with their names +# NOTE: names should be globally unique +NODE_CLASS_MAPPINGS = { + "LatentSafetyFilter": LatentSafetyFilter + +} + +# A dictionary that contains the friendly/humanly readable titles for the nodes +NODE_DISPLAY_NAME_MAPPINGS = { + "LatentSafetyFilter": "Latent Safety Filter" +} diff --git a/custom_nodes/safety_filter.py b/custom_nodes/safety_filter.py new file mode 100644 index 000000000000..1905d331dd75 --- /dev/null +++ b/custom_nodes/safety_filter.py @@ -0,0 +1,155 @@ +from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer +from sklearn.metrics.pairwise import cosine_similarity +import torch + +class SafetyFilter: + """ + A example node + + Class methods + ------------- + INPUT_TYPES (dict): + Tell the main program input parameters of nodes. + IS_CHANGED: + optional method to control when the node is re executed. + + Attributes + ---------- + RETURN_TYPES (`tuple`): + The type of each element in the output tulple. + RETURN_NAMES (`tuple`): + Optional: The name of each output in the output tulple. + FUNCTION (`str`): + The name of the entry-point method. For example, if `FUNCTION = "execute"` then it will run Example().execute() + OUTPUT_NODE ([`bool`]): + If this node is an output node that outputs a result/image from the graph. The SaveImage node is an example. + The backend iterates on these output nodes and tries to execute all their parents if their parent graph is properly connected. + Assumed to be False if not present. + CATEGORY (`str`): + The category the node should appear in the UI. + execute(s) -> tuple || None: + The entry point method. 
The name of this method must be the same as the value of property `FUNCTION`. + For example, if `FUNCTION = "execute"` then this method's name must be `execute`, if `FUNCTION = "foo"` then it must be `foo`. + """ + def __init__(self): + pass + + @classmethod + def INPUT_TYPES(s): + """ + Return a dictionary which contains config for all input fields. + Some types (string): "MODEL", "VAE", "CLIP", "CONDITIONING", "LATENT", "IMAGE", "INT", "STRING", "FLOAT". + Input types "INT", "STRING" or "FLOAT" are special values for fields on the node. + The type can be a list for selection. + + Returns: `dict`: + - Key input_fields_group (`string`): Can be either required, hidden or optional. A node class must have property `required` + - Value input_fields (`dict`): Contains input fields config: + * Key field_name (`string`): Name of a entry-point method's argument + * Value field_config (`tuple`): + + First value is a string indicate the type of field or a list for selection. + + Secound value is a config for type "INT", "STRING" or "FLOAT". + """ + return { + "required": { + "image": ("IMAGE",), + "safety_filter": ("STRING", { + "multiline": False, #True if you want the field to look like the one on the ClipTextEncode node + "default": "nsfw" + }), + "threshold": ("FLOAT", { + "default": 0.2, + "min": 0.0, + "max": 1.0, + "step": 0.01, + "round": 0.001, #The value represeting the precision to round to, will be set to the step value by default. Can be set to False to disable rounding. 
+ "display": "number"}), + "int_field": ("INT", { + "default": 0, + "min": 0, #Minimum value + "max": 4096, #Maximum value + "step": 64, #Slider's step + "display": "number" # Cosmetic only: display as "number" or "slider" + }), + "print_to_screen": (["enable", "disable"],), + }, + } + + RETURN_TYPES = ("IMAGE",) + #RETURN_NAMES = ("image_output_name",) + + FUNCTION = "test" + + #OUTPUT_NODE = False + + CATEGORY = "Safety" + + def get_model_info(self, model_ID, device): + model = CLIPModel.from_pretrained(model_ID).to(device) + processor = CLIPProcessor.from_pretrained(model_ID) + tokenizer = CLIPTokenizer.from_pretrained(model_ID) + return model, processor, tokenizer + + def test(self, image, safety_filter, int_field, threshold, print_to_screen): + device = "cuda" if torch.cuda.is_available() else "cpu" + + model_IDs = ["openai/clip-vit-base-patch32", "openai/clip-vit-large-patch14"] + model_ID = model_IDs[1] + model_clip, processor, tokenizer = self.get_model_info(model_ID, device) + + #url = "http://images.cocodataset.org/val2017/000000039769.jpg" + + + inputs = processor(text=safety_filter, images=image, return_tensors="pt", padding=True).to(device) + outputs = model_clip(**inputs) + + logits_per_image = outputs.logits_per_image # this is the image-text similarity score + probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + + print(probs) + print(logits_per_image) + + processed_images = [] + for i, logits_per_image1 in enumerate(logits_per_image): + # Do some processing on the image, in this example I just invert it + if (logits_per_image1 / 100.0 > threshold): + image[i] = 0.0 * image[i] + + processed_images.append(image[i]) + return (processed_images, i) + if print_to_screen == "enable": + print(f"""Your input contains: + string_field aka input text: {safety_filter} + int_field: {int_field} + float_field: {threshold} + """) + #do some processing on the image, in this example I just invert it + if 
(logits_per_image.item() /100.0 > threshold): + image = 0.0 * image + return (image,) + + """ + The node will always be re executed if any of the inputs change but + this method can be used to force the node to execute again even when the inputs don't change. + You can make this node return a number or a string. This value will be compared to the one returned the last time the node was + executed, if it is different the node will be executed again. + This method is used in the core repo for the LoadImage node where they return the image hash as a string, if the image hash + changes between executions the LoadImage node is executed again. + """ + #@classmethod + #def IS_CHANGED(s, image, string_field, int_field, float_field, print_to_screen): + # return "" + +# Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension +# WEB_DIRECTORY = "./somejs" + +# A dictionary that contains all nodes you want to export with their names +# NOTE: names should be globally unique +NODE_CLASS_MAPPINGS = { + "SafetyFilter": SafetyFilter +} + +# A dictionary that contains the friendly/humanly readable titles for the nodes +NODE_DISPLAY_NAME_MAPPINGS = { + "SafetyFilter": "Safety Filter" +} diff --git a/execution.py b/execution.py index 76225a9623d0..0a2e62e7ec47 100644 --- a/execution.py +++ b/execution.py @@ -3,6 +3,7 @@ import logging import threading import heapq +import time import traceback import inspect from typing import List, Literal, NamedTuple, Optional @@ -247,6 +248,8 @@ def recursive_output_delete_if_changed(prompt, old_prompt, outputs, current_item to_delete = True elif unique_id not in old_prompt: to_delete = True + elif class_type != old_prompt[unique_id]['class_type']: + to_delete = True elif inputs == old_prompt[unique_id]['inputs']: for x in inputs: input_data = inputs[x] @@ -281,7 +284,11 @@ def reset(self): self.success = True self.old_prompt = {} - def add_message(self, event, data, broadcast: bool): + def 
add_message(self, event, data: dict, broadcast: bool): + data = { + **data, + "timestamp": int(time.time() * 1000), + } self.status_messages.append((event, data)) if self.server.client_id is not None or broadcast: self.server.send_sync(event, data, self.server.client_id) @@ -392,6 +399,9 @@ def execute(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): if self.success is not True: self.handle_execution_error(prompt_id, prompt, current_outputs, executed, error, ex) break + else: + # Only execute when the while-loop ends without break + self.add_message("execution_success", { "prompt_id": prompt_id }, broadcast=False) for x in executed: self.old_prompt[x] = copy.deepcopy(prompt[x]) diff --git a/extra_model_paths.yaml b/extra_model_paths.yaml new file mode 100644 index 000000000000..74c4f53ab564 --- /dev/null +++ b/extra_model_paths.yaml @@ -0,0 +1,43 @@ +#Rename this to extra_model_paths.yaml and ComfyUI will load it + + +#config for a1111 ui +#all you have to do is change the base_path to where yours is installed +a111: + base_path: path/to/stable-diffusion-webui/ + + checkpoints: models/Stable-diffusion + configs: models/Stable-diffusion + vae: models/VAE + loras: | + models/Lora + models/LyCORIS + upscale_models: | + models/ESRGAN + models/RealESRGAN + models/SwinIR + embeddings: embeddings + hypernetworks: models/hypernetworks + controlnet: models/ControlNet + +#config for comfyui +#your base path should be either an existing comfy install or a central folder where you store all of your models, loras, etc. 
+ +#comfyui: +# base_path: path/to/comfyui/ +# checkpoints: models/checkpoints/ +# clip: models/clip/ +# clip_vision: models/clip_vision/ +# configs: models/configs/ +# controlnet: models/controlnet/ +# embeddings: models/embeddings/ +# loras: models/loras/ +# upscale_models: models/upscale_models/ +# vae: models/vae/ + +other_ui: + base_path: /dlabscratch1/wendler/models/sdxl-turbo + checkpoints: ./ + #vae: models/vae + #gligen: models/gligen + #custom_nodes: path/custom_nodes diff --git a/folder_paths.py b/folder_paths.py index 234b734095e5..2baf8ce1c9f2 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -1,10 +1,13 @@ import os import time import logging +from typing import Set, List, Dict, Tuple -supported_pt_extensions = set(['.ckpt', '.pt', '.bin', '.pth', '.safetensors', '.pkl']) +supported_pt_extensions: Set[str] = set(['.ckpt', '.pt', '.bin', '.pth', '.safetensors', '.pkl']) -folder_names_and_paths = {} +SupportedFileExtensionsType = Set[str] +ScanPathType = List[str] +folder_names_and_paths: Dict[str, Tuple[ScanPathType, SupportedFileExtensionsType]] = {} base_path = os.path.dirname(os.path.realpath(__file__)) models_dir = os.path.join(base_path, "models") @@ -26,7 +29,7 @@ folder_names_and_paths["upscale_models"] = ([os.path.join(models_dir, "upscale_models")], supported_pt_extensions) -folder_names_and_paths["custom_nodes"] = ([os.path.join(base_path, "custom_nodes")], []) +folder_names_and_paths["custom_nodes"] = ([os.path.join(base_path, "custom_nodes")], set()) folder_names_and_paths["hypernetworks"] = ([os.path.join(models_dir, "hypernetworks")], supported_pt_extensions) diff --git a/main.py b/main.py index a374f2b124a7..196351a3d2cb 100644 --- a/main.py +++ b/main.py @@ -5,6 +5,8 @@ import importlib.util import folder_paths import time +from comfy.cli_args import args + def execute_prestartup_script(): def execute_script(script_path): @@ -18,6 +20,9 @@ def execute_script(script_path): print(f"Failed to execute startup-script: {script_path} / 
{e}") return False + if args.disable_all_custom_nodes: + return + node_paths = folder_paths.get_folder_paths("custom_nodes") for custom_node_path in node_paths: possible_modules = os.listdir(custom_node_path) @@ -53,7 +58,6 @@ def execute_script(script_path): import threading import gc -from comfy.cli_args import args import logging if os.name == "nt": @@ -76,7 +80,7 @@ def execute_script(script_path): import execution import server from server import BinaryEventTypes -from nodes import init_custom_nodes +import nodes import comfy.model_management def cuda_malloc_warning(): @@ -214,7 +218,7 @@ def load_extra_path_config(yaml_path): for config_path in itertools.chain(*args.extra_model_paths_config): load_extra_path_config(config_path) - init_custom_nodes() + nodes.init_extra_nodes(init_custom_nodes=not args.disable_all_custom_nodes) cuda_malloc_warning() diff --git a/node_helpers.py b/node_helpers.py index 43b9e829f599..fee6287901bf 100644 --- a/node_helpers.py +++ b/node_helpers.py @@ -1,3 +1,7 @@ +import hashlib + +from comfy.cli_args import args + from PIL import ImageFile, UnidentifiedImageError def conditioning_set_values(conditioning, values={}): @@ -22,3 +26,12 @@ def pillow(fn, arg): if prev_value is not None: ImageFile.LOAD_TRUNCATED_IMAGES = prev_value return x + +def hasher(): + hashfuncs = { + "md5": hashlib.md5, + "sha1": hashlib.sha1, + "sha256": hashlib.sha256, + "sha512": hashlib.sha512 + } + return hashfuncs[args.default_hashing_function] diff --git a/nodes.py b/nodes.py index 99645b81c367..fbdcb6c91aca 100644 --- a/nodes.py +++ b/nodes.py @@ -55,8 +55,9 @@ def INPUT_TYPES(s): def encode(self, clip, text): tokens = clip.tokenize(text) - cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True) - return ([[cond, {"pooled_output": pooled}]], ) + output = clip.encode_from_tokens(tokens, return_pooled=True, return_dict=True) + cond = output.pop("cond") + return ([[cond, output]], ) class ConditioningCombine: @classmethod @@ -232,8 +233,9 @@ def 
zero_out(self, conditioning): c = [] for t in conditioning: d = t[1].copy() - if "pooled_output" in d: - d["pooled_output"] = torch.zeros_like(d["pooled_output"]) + pooled_output = d.get("pooled_output", None) + if pooled_output is not None: + d["pooled_output"] = torch.zeros_like(pooled_output) n = [torch.zeros_like(t[0]), d] c.append(n) return (c, ) @@ -746,7 +748,7 @@ def INPUT_TYPES(s): RETURN_TYPES = ("CONDITIONING",) FUNCTION = "apply_controlnet" - CATEGORY = "conditioning" + CATEGORY = "conditioning/controlnet" def apply_controlnet(self, conditioning, control_net, image, strength): if strength == 0: @@ -781,9 +783,9 @@ def INPUT_TYPES(s): RETURN_NAMES = ("positive", "negative") FUNCTION = "apply_controlnet" - CATEGORY = "conditioning" + CATEGORY = "conditioning/controlnet" - def apply_controlnet(self, positive, negative, control_net, image, strength, start_percent, end_percent): + def apply_controlnet(self, positive, negative, control_net, image, strength, start_percent, end_percent, vae=None): if strength == 0: return (positive, negative) @@ -800,7 +802,7 @@ def apply_controlnet(self, positive, negative, control_net, image, strength, sta if prev_cnet in cnets: c_net = cnets[prev_cnet] else: - c_net = control_net.copy().set_cond_hint(control_hint, strength, (start_percent, end_percent)) + c_net = control_net.copy().set_cond_hint(control_hint, strength, (start_percent, end_percent), vae) c_net.set_previous_controlnet(prev_cnet) cnets[prev_cnet] = c_net @@ -1887,7 +1889,30 @@ def expand_image(self, image, left, top, right, bottom, feathering): EXTENSION_WEB_DIRS = {} -def load_custom_node(module_path, ignore=set()): + +def get_module_name(module_path: str) -> str: + """ + Returns the module name based on the given module path. 
+ Examples: + get_module_name("C:/Users/username/ComfyUI/custom_nodes/my_custom_node.py") -> "my_custom_node" + get_module_name("C:/Users/username/ComfyUI/custom_nodes/my_custom_node") -> "my_custom_node" + get_module_name("C:/Users/username/ComfyUI/custom_nodes/my_custom_node/") -> "my_custom_node" + get_module_name("C:/Users/username/ComfyUI/custom_nodes/my_custom_node/__init__.py") -> "my_custom_node" + get_module_name("C:/Users/username/ComfyUI/custom_nodes/my_custom_node/__init__") -> "my_custom_node" + get_module_name("C:/Users/username/ComfyUI/custom_nodes/my_custom_node/__init__/") -> "my_custom_node" + get_module_name("C:/Users/username/ComfyUI/custom_nodes/my_custom_node.disabled") -> "custom_nodes + Args: + module_path (str): The path of the module. + Returns: + str: The module name. + """ + base_path = os.path.basename(module_path) + if os.path.isfile(module_path): + base_path = os.path.splitext(base_path)[0] + return base_path + + +def load_custom_node(module_path: str, ignore=set(), module_parent="custom_nodes") -> bool: module_name = os.path.basename(module_path) if os.path.isfile(module_path): sp = os.path.splitext(module_path) @@ -1911,9 +1936,10 @@ def load_custom_node(module_path, ignore=set()): EXTENSION_WEB_DIRS[module_name] = web_dir if hasattr(module, "NODE_CLASS_MAPPINGS") and getattr(module, "NODE_CLASS_MAPPINGS") is not None: - for name in module.NODE_CLASS_MAPPINGS: + for name, node_cls in module.NODE_CLASS_MAPPINGS.items(): if name not in ignore: - NODE_CLASS_MAPPINGS[name] = module.NODE_CLASS_MAPPINGS[name] + NODE_CLASS_MAPPINGS[name] = node_cls + node_cls.RELATIVE_PYTHON_MODULE = "{}.{}".format(module_parent, get_module_name(module_path)) if hasattr(module, "NODE_DISPLAY_NAME_MAPPINGS") and getattr(module, "NODE_DISPLAY_NAME_MAPPINGS") is not None: NODE_DISPLAY_NAME_MAPPINGS.update(module.NODE_DISPLAY_NAME_MAPPINGS) return True @@ -1925,7 +1951,16 @@ def load_custom_node(module_path, ignore=set()): logging.warning(f"Cannot import 
{module_path} module for custom nodes: {e}") return False -def load_custom_nodes(): +def init_external_custom_nodes(): + """ + Initializes the external custom nodes. + + This function loads custom nodes from the specified folder paths and imports them into the application. + It measures the import times for each custom node and logs the results. + + Returns: + None + """ base_node_names = set(NODE_CLASS_MAPPINGS.keys()) node_paths = folder_paths.get_folder_paths("custom_nodes") node_import_times = [] @@ -1939,7 +1974,7 @@ def load_custom_nodes(): if os.path.isfile(module_path) and os.path.splitext(module_path)[1] != ".py": continue if module_path.endswith(".disabled"): continue time_before = time.perf_counter() - success = load_custom_node(module_path, base_node_names) + success = load_custom_node(module_path, base_node_names, module_parent="custom_nodes") node_import_times.append((time.perf_counter() - time_before, module_path, success)) if len(node_import_times) > 0: @@ -1952,7 +1987,16 @@ def load_custom_nodes(): logging.info("{:6.1f} seconds{}: {}".format(n[0], import_message, n[1])) logging.info("") -def init_custom_nodes(): +def init_builtin_extra_nodes(): + """ + Initializes the built-in extra nodes in ComfyUI. + + This function loads the extra node files located in the "comfy_extras" directory and imports them into ComfyUI. + If any of the extra node files fail to import, a warning message is logged. 
+ + Returns: + None + """ extras_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_extras") extras_files = [ "nodes_latent.py", @@ -1992,14 +2036,24 @@ def init_custom_nodes(): "nodes_audio.py", "nodes_sd3.py", "nodes_gits.py", + "nodes_controlnet.py", ] import_failed = [] for node_file in extras_files: - if not load_custom_node(os.path.join(extras_dir, node_file)): + if not load_custom_node(os.path.join(extras_dir, node_file), module_parent="comfy_extras"): import_failed.append(node_file) - load_custom_nodes() + return import_failed + + +def init_extra_nodes(init_custom_nodes=True): + import_failed = init_builtin_extra_nodes() + + if init_custom_nodes: + init_external_custom_nodes() + else: + logging.info("Skipping loading of custom nodes") if len(import_failed) > 0: logging.warning("WARNING: some comfy_extras/ nodes did not import correctly. This may be because they are missing some dependencies.\n") diff --git a/pytest.ini b/pytest.ini index b5a68e0f12fe..8b7a747e7603 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,8 @@ [pytest] markers = inference: mark as inference test (deselect with '-m "not inference"') -testpaths = tests -addopts = -s \ No newline at end of file +testpaths = + tests + tests-unit +addopts = -s +pythonpath = . 
diff --git a/requirements.txt b/requirements.txt index 108958d2f435..4c2c0b2b2215 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,9 @@ torchsde torchvision torchaudio einops -transformers>=4.25.1 +transformers>=4.28.1 +tokenizers>=0.13.3 +sentencepiece safetensors>=0.4.2 aiohttp pyyaml diff --git a/server.py b/server.py index 30ea90c6c91c..23ca2fd33334 100644 --- a/server.py +++ b/server.py @@ -12,6 +12,7 @@ import glob import struct import ssl +import hashlib from PIL import Image, ImageOps from PIL.PngImagePlugin import PngInfo from io import BytesIO @@ -24,9 +25,11 @@ from comfy.cli_args import args import comfy.utils import comfy.model_management - +import node_helpers +from app.frontend_management import FrontendManager from app.user_manager import UserManager + class BinaryEventTypes: PREVIEW_IMAGE = 1 UNENCODED_PREVIEW_IMAGE = 2 @@ -82,8 +85,12 @@ def __init__(self, loop): max_upload_size = round(args.max_upload_size * 1024 * 1024) self.app = web.Application(client_max_size=max_upload_size, middlewares=middlewares) self.sockets = dict() - self.web_root = os.path.join(os.path.dirname( - os.path.realpath(__file__)), "web") + self.web_root = ( + FrontendManager.init_frontend(args.front_end_version) + if args.front_end_root is None + else args.front_end_root + ) + logging.info(f"[Prompt Server] web root: {self.web_root}") routes = web.RouteTableDef() self.routes = routes self.last_node_id = None @@ -110,7 +117,7 @@ async def websocket_handler(request): # On reconnect if we are the currently executing client send the current node if self.client_id == sid and self.last_node_id is not None: await self.send("executing", { "node": self.last_node_id }, sid) - + async for msg in ws: if msg.type == aiohttp.WSMsgType.ERROR: logging.warning('ws connection closed with exception %s' % ws.exception()) @@ -131,9 +138,9 @@ def get_embeddings(self): async def get_extensions(request): files = glob.glob(os.path.join( glob.escape(self.web_root), 
'extensions/**/*.js'), recursive=True) - + extensions = list(map(lambda f: "/" + os.path.relpath(f, self.web_root).replace("\\", "/"), files)) - + for name, dir in nodes.EXTENSION_WEB_DIRS.items(): files = glob.glob(os.path.join(glob.escape(dir), '**/*.js'), recursive=True) extensions.extend(list(map(lambda f: "/extensions/" + urllib.parse.quote( @@ -154,9 +161,25 @@ def get_dir_by_type(dir_type): return type_dir, dir_type + def compare_image_hash(filepath, image): + hasher = node_helpers.hasher() + + # function to compare hashes of two images to see if it already exists, fix to #3465 + if os.path.exists(filepath): + a = hasher() + b = hasher() + with open(filepath, "rb") as f: + a.update(f.read()) + b.update(image.file.read()) + image.file.seek(0) + f.close() + return a.hexdigest() == b.hexdigest() + return False + def image_upload(post, image_save_function=None): image = post.get("image") overwrite = post.get("overwrite") + image_is_duplicate = False image_upload_type = post.get("type") upload_dir, image_upload_type = get_dir_by_type(image_upload_type) @@ -183,15 +206,19 @@ def image_upload(post, image_save_function=None): else: i = 1 while os.path.exists(filepath): + if compare_image_hash(filepath, image): #compare hash to prevent saving of duplicates with same name, fix for #3465 + image_is_duplicate = True + break filename = f"{split[0]} ({i}){split[1]}" filepath = os.path.join(full_output_folder, filename) i += 1 - if image_save_function is not None: - image_save_function(image, post, filepath) - else: - with open(filepath, "wb") as f: - f.write(image.file.read()) + if not image_is_duplicate: + if image_save_function is not None: + image_save_function(image, post, filepath) + else: + with open(filepath, "wb") as f: + f.write(image.file.read()) return web.json_response({"name" : filename, "subfolder": subfolder, "type": image_upload_type}) else: @@ -397,6 +424,7 @@ def node_info(node_class): info['name'] = node_class info['display_name'] = 
nodes.NODE_DISPLAY_NAME_MAPPINGS[node_class] if node_class in nodes.NODE_DISPLAY_NAME_MAPPINGS.keys() else node_class info['description'] = obj_class.DESCRIPTION if hasattr(obj_class,'DESCRIPTION') else '' + info['python_module'] = getattr(obj_class, "RELATIVE_PYTHON_MODULE", "nodes") info['category'] = 'sd' if hasattr(obj_class, 'OUTPUT_NODE') and obj_class.OUTPUT_NODE == True: info['output_node'] = True diff --git a/tests-unit/README.md b/tests-unit/README.md new file mode 100644 index 000000000000..94abd9853468 --- /dev/null +++ b/tests-unit/README.md @@ -0,0 +1,8 @@ +# Pytest Unit Tests + +## Install test dependencies + +`pip install -r tests-units/requirements.txt` + +## Run tests +`pytest tests-units/` diff --git a/tests-unit/app_test/__init__.py b/tests-unit/app_test/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests-unit/app_test/frontend_manager_test.py b/tests-unit/app_test/frontend_manager_test.py new file mode 100644 index 000000000000..637869cfbf55 --- /dev/null +++ b/tests-unit/app_test/frontend_manager_test.py @@ -0,0 +1,100 @@ +import argparse +import pytest +from requests.exceptions import HTTPError + +from app.frontend_management import ( + FrontendManager, + FrontEndProvider, + Release, +) +from comfy.cli_args import DEFAULT_VERSION_STRING + + +@pytest.fixture +def mock_releases(): + return [ + Release( + id=1, + tag_name="1.0.0", + name="Release 1.0.0", + prerelease=False, + created_at="2022-01-01T00:00:00Z", + published_at="2022-01-01T00:00:00Z", + body="Release notes for 1.0.0", + assets=[{"name": "dist.zip", "url": "https://example.com/dist.zip"}], + ), + Release( + id=2, + tag_name="2.0.0", + name="Release 2.0.0", + prerelease=False, + created_at="2022-02-01T00:00:00Z", + published_at="2022-02-01T00:00:00Z", + body="Release notes for 2.0.0", + assets=[{"name": "dist.zip", "url": "https://example.com/dist.zip"}], + ), + ] + + +@pytest.fixture +def mock_provider(mock_releases): + provider = FrontEndProvider( + 
owner="test-owner", + repo="test-repo", + ) + provider.all_releases = mock_releases + provider.latest_release = mock_releases[1] + FrontendManager.PROVIDERS = [provider] + return provider + + +def test_get_release(mock_provider, mock_releases): + version = "1.0.0" + release = mock_provider.get_release(version) + assert release == mock_releases[0] + + +def test_get_release_latest(mock_provider, mock_releases): + version = "latest" + release = mock_provider.get_release(version) + assert release == mock_releases[1] + + +def test_get_release_invalid_version(mock_provider): + version = "invalid" + with pytest.raises(ValueError): + mock_provider.get_release(version) + + +def test_init_frontend_default(): + version_string = DEFAULT_VERSION_STRING + frontend_path = FrontendManager.init_frontend(version_string) + assert frontend_path == FrontendManager.DEFAULT_FRONTEND_PATH + + +def test_init_frontend_invalid_version(): + version_string = "test-owner/test-repo@1.100.99" + with pytest.raises(HTTPError): + FrontendManager.init_frontend_unsafe(version_string) + + +def test_init_frontend_invalid_provider(): + version_string = "invalid/invalid@latest" + with pytest.raises(HTTPError): + FrontendManager.init_frontend_unsafe(version_string) + + +def test_parse_version_string(): + version_string = "owner/repo@1.0.0" + repo_owner, repo_name, version = FrontendManager.parse_version_string( + version_string + ) + assert repo_owner == "owner" + assert repo_name == "repo" + assert version == "1.0.0" + + +def test_parse_version_string_invalid(): + version_string = "invalid" + with pytest.raises(argparse.ArgumentTypeError): + FrontendManager.parse_version_string(version_string) diff --git a/tests-unit/requirements.txt b/tests-unit/requirements.txt new file mode 100644 index 000000000000..0587502f877a --- /dev/null +++ b/tests-unit/requirements.txt @@ -0,0 +1 @@ +pytest>=7.8.0 diff --git a/web/extensions/core/uploadAudio.js b/web/extensions/core/uploadAudio.js index 
d3dfef4a7e80..9dfa029bf681 100644 --- a/web/extensions/core/uploadAudio.js +++ b/web/extensions/core/uploadAudio.js @@ -17,7 +17,6 @@ function getResourceURL(subfolder, filename, type = "input") { "filename=" + encodeURIComponent(filename), "type=" + type, "subfolder=" + subfolder, - app.getPreviewFormatParam().substring(1), app.getRandParam().substring(1) ].join("&") @@ -70,7 +69,7 @@ async function uploadFile( app.registerExtension({ name: "Comfy.AudioWidget", async beforeRegisterNodeDef(nodeType, nodeData) { - if (["LoadAudio", "SaveAudio"].includes(nodeType.comfyClass)) { + if (["LoadAudio", "SaveAudio", "PreviewAudio"].includes(nodeType.comfyClass)) { nodeData.input.required.audioUI = ["AUDIO_UI"] } }, @@ -103,7 +102,7 @@ app.registerExtension({ if (!audios) return const audio = audios[0] audioUIWidget.element.src = api.apiURL( - getResourceURL(audio.subfolder, audio.filename, "output") + getResourceURL(audio.subfolder, audio.filename, audio.type) ) audioUIWidget.element.classList.remove("empty-audio-widget") } @@ -118,7 +117,7 @@ app.registerExtension({ if ("audio" in output) { const audioUIWidget = node.widgets.find((w) => w.name === "audioUI"); const audio = output.audio[0]; - audioUIWidget.element.src = api.apiURL(getResourceURL(audio.subfolder, audio.filename, "output")); + audioUIWidget.element.src = api.apiURL(getResourceURL(audio.subfolder, audio.filename, audio.type)); audioUIWidget.element.classList.remove("empty-audio-widget"); } } @@ -145,9 +144,20 @@ app.registerExtension({ ) } // Initially load default audio file to audioUIWidget. 
- onAudioWidgetUpdate() + if (audioWidget.value) { + onAudioWidgetUpdate() + } audioWidget.callback = onAudioWidgetUpdate + // Load saved audio file widget values if restoring from workflow + const onGraphConfigured = node.onGraphConfigured; + node.onGraphConfigured = function() { + onGraphConfigured?.apply(this, arguments) + if (audioWidget.value) { + onAudioWidgetUpdate() + } + } + const fileInput = document.createElement("input") fileInput.type = "file" fileInput.accept = "audio/*" diff --git a/web/scripts/api.js b/web/scripts/api.js index 39f0a9bb27e7..03c3fb607ec1 100644 --- a/web/scripts/api.js +++ b/web/scripts/api.js @@ -136,6 +136,9 @@ class ComfyApi extends EventTarget { case "execution_start": this.dispatchEvent(new CustomEvent("execution_start", { detail: msg.data })); break; + case "execution_success": + this.dispatchEvent(new CustomEvent("execution_success", { detail: msg.data })); + break; case "execution_error": this.dispatchEvent(new CustomEvent("execution_error", { detail: msg.data })); break; diff --git a/web/scripts/app.js b/web/scripts/app.js index 43df610657dd..8b4478a322a1 100644 --- a/web/scripts/app.js +++ b/web/scripts/app.js @@ -1084,7 +1084,7 @@ export class ComfyApp { if (e.type == "keydown" && !e.repeat) { // Ctrl + M mute/unmute - if (e.key === 'm' && e.ctrlKey) { + if (e.key === 'm' && (e.metaKey || e.ctrlKey)) { if (this.selected_nodes) { for (var i in this.selected_nodes) { if (this.selected_nodes[i].mode === 2) { // never @@ -1098,7 +1098,7 @@ export class ComfyApp { } // Ctrl + B bypass - if (e.key === 'b' && e.ctrlKey) { + if (e.key === 'b' && (e.metaKey || e.ctrlKey)) { if (this.selected_nodes) { for (var i in this.selected_nodes) { if (this.selected_nodes[i].mode === 4) { // never @@ -1599,7 +1599,7 @@ export class ComfyApp { if (json) { const workflow = JSON.parse(json); const workflowName = getStorageValue("Comfy.PreviousWorkflow"); - await this.loadGraphData(workflow, true, workflowName); + await 
this.loadGraphData(workflow, true, true, workflowName); return true; } }; @@ -1966,6 +1966,14 @@ export class ComfyApp { if (widget.value.startsWith("sample_")) { widget.value = widget.value.slice(7); } + if (widget.value === "euler_pp" || widget.value === "euler_ancestral_pp") { + widget.value = widget.value.slice(0, -3); + for (let w of node.widgets) { + if (w.name == "cfg") { + w.value *= 2.0; + } + } + } } } if (node.type == "KSampler" || node.type == "KSamplerAdvanced" || node.type == "PrimitiveNode") { @@ -2284,7 +2292,7 @@ export class ComfyApp { } else { this.showErrorOnFileLoad(file); } - } else if (file.type === "audio/flac") { + } else if (file.type === "audio/flac" || file.type === "audio/x-flac") { const pngInfo = await getFlacMetadata(file); // Support loading workflows from that webp custom node. const workflow = pngInfo?.workflow; @@ -2306,14 +2314,14 @@ export class ComfyApp { } else if(this.isApiJson(jsonContent)) { this.loadApiJson(jsonContent, fileName); } else { - await this.loadGraphData(jsonContent, true, fileName); + await this.loadGraphData(jsonContent, true, true, fileName); } }; reader.readAsText(file); } else if (file.name?.endsWith(".latent") || file.name?.endsWith(".safetensors")) { const info = await getLatentMetadata(file); if (info.workflow) { - await this.loadGraphData(JSON.parse(info.workflow), true, fileName); + await this.loadGraphData(JSON.parse(info.workflow), true, true, fileName); } else if (info.prompt) { this.loadApiJson(JSON.parse(info.prompt)); } else { diff --git a/web/scripts/changeTracker.js b/web/scripts/changeTracker.js index 59901d5fcd1c..39bc4a8104b1 100644 --- a/web/scripts/changeTracker.js +++ b/web/scripts/changeTracker.js @@ -3,7 +3,6 @@ import { api } from "./api.js"; import { clone } from "./utils.js"; - export class ChangeTracker { static MAX_HISTORY = 50; #app; @@ -170,6 +169,19 @@ export class ChangeTracker { return v; }; + // Detects nodes being added via the node search dialog + const onNodeAdded = 
LiteGraph.LGraph.prototype.onNodeAdded; + LiteGraph.LGraph.prototype.onNodeAdded = function () { + const v = onNodeAdded?.apply(this, arguments); + if (!app?.configuringGraph) { + const ct = changeTracker(); + if (!ct.isOurLoad) { + ct.checkState(); + } + } + return v; + }; + // Store node outputs api.addEventListener("executed", ({ detail }) => { const prompt = app.workflowManager.queuedPrompts[detail.prompt_id]; diff --git a/web/scripts/pnginfo.js b/web/scripts/pnginfo.js index 2c03cf74a4e3..8b1b2c61c411 100644 --- a/web/scripts/pnginfo.js +++ b/web/scripts/pnginfo.js @@ -49,7 +49,7 @@ export function getPngMetadata(file) { function parseExifData(exifData) { // Check for the correct TIFF header (0x4949 for little-endian or 0x4D4D for big-endian) - const isLittleEndian = new Uint16Array(exifData.slice(0, 2))[0] === 0x4949; + const isLittleEndian = String.fromCharCode(...exifData.slice(0, 2)) === "II"; // Function to read 16-bit and 32-bit integers from binary data function readInt(offset, isLittleEndian, length) { @@ -134,6 +134,7 @@ export function getWebpMetadata(file) { let index = value.indexOf(':'); txt_chunks[value.slice(0, index)] = value.slice(index + 1); } + break; } offset += 8 + chunk_length; diff --git a/web/scripts/ui/menu/menu.css b/web/scripts/ui/menu/menu.css index 20eeab2cf576..afaed3fb0fb5 100644 --- a/web/scripts/ui/menu/menu.css +++ b/web/scripts/ui/menu/menu.css @@ -19,8 +19,12 @@ padding: 4px 8px; box-sizing: border-box; margin: 0; + transition: box-shadow 0.1s; } +.comfyui-button:active { + box-shadow: inset 1px 1px 10px rgba(0, 0, 0, 0.5); +} .comfyui-button:disabled { opacity: 0.5; cursor: not-allowed; diff --git a/web/scripts/ui/menu/queueButton.js b/web/scripts/ui/menu/queueButton.js index 3c29ab090c03..608f4cc9b00c 100644 --- a/web/scripts/ui/menu/queueButton.js +++ b/web/scripts/ui/menu/queueButton.js @@ -13,8 +13,8 @@ export class ComfyQueueButton { queuePrompt = async (e) => { this.#internalQueueSize += this.queueOptions.batchCount; 
- // Hold shift to queue front - await this.app.queuePrompt(-e.shiftKey, this.queueOptions.batchCount); + // Hold shift to queue front, event is undefined when auto-queue is enabled + await this.app.queuePrompt(e?.shiftKey ? -1 : 0, this.queueOptions.batchCount); }; constructor(app) { diff --git a/web/scripts/ui/menu/workflows.js b/web/scripts/ui/menu/workflows.js index afdff538a221..3b904fb4bbf8 100644 --- a/web/scripts/ui/menu/workflows.js +++ b/web/scripts/ui/menu/workflows.js @@ -182,6 +182,11 @@ export class ComfyWorkflowsMenu { * @param {ComfyWorkflow} workflow */ async function sendToWorkflow(img, workflow) { + const openWorkflow = app.workflowManager.openWorkflows.find((w) => w.path === workflow.path); + if (openWorkflow) { + workflow = openWorkflow; + } + await workflow.load(); let options = []; const nodes = app.graph.computeExecutionOrder(false); @@ -214,7 +219,8 @@ export class ComfyWorkflowsMenu { nodeType.prototype["getExtraMenuOptions"] = function (_, options) { const r = getExtraMenuOptions?.apply?.(this, arguments); - if (app.ui.settings.getSettingValue("Comfy.UseNewMenu", false) === true) { + const setting = app.ui.settings.getSettingValue("Comfy.UseNewMenu", false); + if (setting && setting != "Disabled") { const t = /** @type { {imageIndex?: number, overIndex?: number, imgs: string[]} } */ /** @type {any} */ (this); let img; if (t.imageIndex != null) { diff --git a/web/scripts/workflows.js b/web/scripts/workflows.js index 16bbc9976337..d38b6f5fc0a6 100644 --- a/web/scripts/workflows.js +++ b/web/scripts/workflows.js @@ -301,11 +301,11 @@ export class ComfyWorkflow { load = async () => { if (this.isOpen) { - await this.manager.app.loadGraphData(this.changeTracker.activeState, true, this); + await this.manager.app.loadGraphData(this.changeTracker.activeState, true, true, this); } else { const data = await this.getWorkflowData(); if (!data) return; - await this.manager.app.loadGraphData(data, true, this); + await 
this.manager.app.loadGraphData(data, true, true, this); } }; diff --git a/web/style.css b/web/style.css index e983b652a713..8ef1d0dd101a 100644 --- a/web/style.css +++ b/web/style.css @@ -41,7 +41,7 @@ body { background-color: var(--bg-color); color: var(--fg-color); grid-template-columns: auto 1fr auto; - grid-template-rows: auto auto 1fr auto; + grid-template-rows: auto 1fr auto; min-height: -webkit-fill-available; max-height: -webkit-fill-available; min-width: -webkit-fill-available; @@ -49,32 +49,37 @@ body { } .comfyui-body-top { - order: 0; + order: -5; grid-column: 1/-1; z-index: 10; + display: flex; + flex-direction: column; } .comfyui-body-left { - order: 1; + order: -4; z-index: 10; + display: flex; } #graph-canvas { width: 100%; height: 100%; - order: 2; - grid-column: 1/-1; + order: -3; } .comfyui-body-right { - order: 3; + order: -2; z-index: 10; + display: flex; } .comfyui-body-bottom { - order: 4; + order: -1; grid-column: 1/-1; z-index: 10; + display: flex; + flex-direction: column; } .comfy-multiline-input { @@ -408,8 +413,12 @@ dialog::backdrop { background: rgba(0, 0, 0, 0.5); } -.comfy-dialog.comfyui-dialog { +.comfy-dialog.comfyui-dialog.comfy-modal { top: 0; + left: 0; + right: 0; + bottom: 0; + transform: none; } .comfy-dialog.comfy-modal { diff --git a/workflow.json b/workflow.json new file mode 100644 index 000000000000..4fc3ee38bc8b --- /dev/null +++ b/workflow.json @@ -0,0 +1,708 @@ +{ + "last_node_id": 24, + "last_link_id": 24, + "nodes": [ + { + "id": 7, + "type": "CLIPTextEncode", + "pos": [ + 413, + 389 + ], + "size": { + "0": 425.27801513671875, + "1": 180.6060791015625 + }, + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 14, + "slot_index": 0 + } + ], + "outputs": [ + { + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 6 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode" + }, + "widgets_values": [ + "text, watermark" + ] 
+ }, + { + "id": 4, + "type": "CheckpointLoaderSimple", + "pos": [ + 26, + 474 + ], + "size": { + "0": 315, + "1": 98 + }, + "flags": {}, + "order": 0, + "mode": 0, + "outputs": [ + { + "name": "MODEL", + "type": "MODEL", + "links": [], + "slot_index": 0 + }, + { + "name": "CLIP", + "type": "CLIP", + "links": [], + "slot_index": 1 + }, + { + "name": "VAE", + "type": "VAE", + "links": [], + "slot_index": 2 + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple" + }, + "widgets_values": [ + "sd_xl_turbo_1.0.safetensors" + ] + }, + { + "id": 15, + "type": "CLIPVisionEncode", + "pos": [ + 500, + 35 + ], + "size": { + "0": 380.4000244140625, + "1": 46 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "clip_vision", + "type": "CLIP_VISION", + "link": 17 + }, + { + "name": "image", + "type": "IMAGE", + "link": 18 + } + ], + "outputs": [ + { + "name": "CLIP_VISION_OUTPUT", + "type": "CLIP_VISION_OUTPUT", + "links": null, + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CLIPVisionEncode" + } + }, + { + "id": 18, + "type": "SaveImage", + "pos": [ + 1612, + 478 + ], + "size": { + "0": 210, + "1": 270 + }, + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 20, + "slot_index": 0 + } + ], + "properties": {}, + "widgets_values": [ + "ComfyUI" + ] + }, + { + "id": 3, + "type": "KSampler", + "pos": [ + 863, + 186 + ], + "size": { + "0": 315, + "1": 262 + }, + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "MODEL", + "link": 15, + "slot_index": 0 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 4 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 6 + }, + { + "name": "latent_image", + "type": "LATENT", + "link": 2 + } + ], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 7, + 21 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": 
"KSampler" + }, + "widgets_values": [ + 228380521887091, + "randomize", + 20, + 8, + "euler", + "normal", + 1 + ] + }, + { + "id": 8, + "type": "VAEDecode", + "pos": [ + 1209, + 188 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "samples", + "type": "LATENT", + "link": 7 + }, + { + "name": "vae", + "type": "VAE", + "link": 16, + "slot_index": 1 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 12, + 18, + 19 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "VAEDecode" + } + }, + { + "id": 14, + "type": "unCLIPCheckpointLoader", + "pos": [ + 12, + 134 + ], + "size": { + "0": 315, + "1": 118 + }, + "flags": {}, + "order": 1, + "mode": 0, + "outputs": [ + { + "name": "MODEL", + "type": "MODEL", + "links": [ + 15 + ], + "shape": 3 + }, + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 13, + 14 + ], + "shape": 3 + }, + { + "name": "VAE", + "type": "VAE", + "links": [ + 16, + 23 + ], + "shape": 3, + "slot_index": 2 + }, + { + "name": "CLIP_VISION", + "type": "CLIP_VISION", + "links": [ + 17 + ], + "shape": 3, + "slot_index": 3 + } + ], + "properties": { + "Node name for S&R": "unCLIPCheckpointLoader" + }, + "widgets_values": [ + "sd_xl_turbo_1.0.safetensors" + ] + }, + { + "id": 23, + "type": "VAEDecode", + "pos": [ + 846, + 826 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "samples", + "type": "LATENT", + "link": 22 + }, + { + "name": "vae", + "type": "VAE", + "link": 23, + "slot_index": 1 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 24 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "VAEDecode" + } + }, + { + "id": 9, + "type": "SaveImage", + "pos": [ + 1600, + 86 + ], + "size": { + "0": 210, + "1": 270 + }, + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 
12, + "slot_index": 0 + } + ], + "properties": {}, + "widgets_values": [ + "ComfyUI" + ] + }, + { + "id": 24, + "type": "SaveImage", + "pos": [ + 1346, + 579 + ], + "size": { + "0": 210, + "1": 270 + }, + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 24, + "slot_index": 0 + } + ], + "properties": {}, + "widgets_values": [ + "ComfyUI" + ] + }, + { + "id": 5, + "type": "EmptyLatentImage", + "pos": [ + 473, + 609 + ], + "size": { + "0": 315, + "1": 106 + }, + "flags": {}, + "order": 2, + "mode": 0, + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 2 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "EmptyLatentImage" + }, + "widgets_values": [ + 512, + 512, + 9 + ] + }, + { + "id": 6, + "type": "CLIPTextEncode", + "pos": [ + 415, + 186 + ], + "size": { + "0": 422.84503173828125, + "1": 164.31304931640625 + }, + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 13, + "slot_index": 0 + } + ], + "outputs": [ + { + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 4 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode" + }, + "widgets_values": [ + "pizza" + ] + }, + { + "id": 17, + "type": "SafetyFilter", + "pos": [ + 1229, + 351 + ], + "size": { + "0": 315, + "1": 130 + }, + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "image", + "type": "IMAGE", + "link": 19 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 20 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "SafetyFilter" + }, + "widgets_values": [ + "food", + 0.17, + 0, + "enable" + ] + }, + { + "id": 22, + "type": "LatentSafetyFilter", + "pos": [ + 964, + 540 + ], + "size": { + "0": 315, + "1": 130 + }, + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "samples", + "type": "LATENT", + "link": 21 
+ } + ], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 22 + ], + "shape": 3, + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "LatentSafetyFilter" + }, + "widgets_values": [ + "food", + 0.667, + 0, + "enable" + ] + } + ], + "links": [ + [ + 2, + 5, + 0, + 3, + 3, + "LATENT" + ], + [ + 4, + 6, + 0, + 3, + 1, + "CONDITIONING" + ], + [ + 6, + 7, + 0, + 3, + 2, + "CONDITIONING" + ], + [ + 7, + 3, + 0, + 8, + 0, + "LATENT" + ], + [ + 12, + 8, + 0, + 9, + 0, + "IMAGE" + ], + [ + 13, + 14, + 1, + 6, + 0, + "CLIP" + ], + [ + 14, + 14, + 1, + 7, + 0, + "CLIP" + ], + [ + 15, + 14, + 0, + 3, + 0, + "MODEL" + ], + [ + 16, + 14, + 2, + 8, + 1, + "VAE" + ], + [ + 17, + 14, + 3, + 15, + 0, + "CLIP_VISION" + ], + [ + 18, + 8, + 0, + 15, + 1, + "IMAGE" + ], + [ + 19, + 8, + 0, + 17, + 0, + "IMAGE" + ], + [ + 20, + 17, + 0, + 18, + 0, + "IMAGE" + ], + [ + 21, + 3, + 0, + 22, + 0, + "LATENT" + ], + [ + 22, + 22, + 0, + 23, + 0, + "LATENT" + ], + [ + 23, + 14, + 2, + 23, + 1, + "VAE" + ], + [ + 24, + 23, + 0, + 24, + 0, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": {}, + "version": 0.4 + } \ No newline at end of file