Commit 1aba099: FixLogo

NaruseMioShirakana committed Jun 1, 2024
1 parent cb23193 commit 1aba099
Showing 11 changed files with 2,426 additions and 21 deletions.
1 change: 0 additions & 1 deletion CSharpDemo/Program.cs
@@ -1,6 +1,5 @@
using LibSvcApi;


LibSvc.LibSvcHparams Config = new();
Config.TensorExtractor = "DiffusionSvc";
Config.SamplingRate = 44100;
747 changes: 747 additions & 0 deletions CSharpDemo/README.md

Large diffs are not rendered by default.

704 changes: 704 additions & 0 deletions CSharpDemo/README_en.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.md
@@ -1,5 +1,6 @@
<div align="center">

![image](logo/logo256(AIGen).png)
# DragonianVoice
[中文](README.md) | [English](README_en.md)

25 changes: 24 additions & 1 deletion fish-speech.cpp/include/llama.h
@@ -67,15 +67,38 @@ class Attention : public Module
bool _Inplace = false
);

bool training = false;

private:
ggml_tensor* apply_rotary_emb(
ggml_tensor* x,
ggml_tensor* freqs_cis,
ggml_context* _Ctx,
bool _Inplace = false
);

private:
static ggml_tensor* scaled_dot_product_attention(
ggml_tensor* query,
ggml_tensor* key,
ggml_tensor* value,
ggml_context* _Ctx,
ggml_tensor* attn_mask = nullptr,
float dropout_p = 0.0,
bool _Inplace = false
);

static ggml_tensor* eq_scaled_dot_product_attention(
ggml_tensor* query,
ggml_tensor* key,
ggml_tensor* value,
ggml_context* _Ctx,
ggml_tensor* attn_mask = nullptr,
float dropout_p = 0.0,
bool _Inplace = false
);

void DumpCurrentLayerInfo(std::wstring& _Tmp) override;

int total_head_dim;
Linear wqkv, wo;
float dropout;
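Note (not part of this commit): apart from the ggml_context and _Inplace parameters, the two new static declarations follow the argument order of PyTorch's torch.nn.functional.scaled_dot_product_attention (query, key, value, attn_mask, dropout_p); in the .cpp below both currently route to ggml_flash_attn. Purely as a reference for those semantics, a single-head sketch over plain float buffers (hypothetical helper, no dropout) might look like:

#include <algorithm>
#include <cmath>
#include <vector>

// Reference semantics of scaled dot-product attention for one head:
// out = softmax(Q * K^T / sqrt(d) + mask) * V, with Q:[n_q,d], K/V:[n_kv,d].
// Illustrative only; the commit itself dispatches to ggml_flash_attn.
std::vector<float> sdpa_reference(
    const std::vector<float>& Q, const std::vector<float>& K,
    const std::vector<float>& V, int n_q, int n_kv, int d,
    const std::vector<float>* mask = nullptr)
{
    std::vector<float> out(static_cast<size_t>(n_q) * d, 0.f);
    const float scale = 1.f / std::sqrt(static_cast<float>(d));
    for (int i = 0; i < n_q; ++i)
    {
        std::vector<float> score(n_kv);
        float maxv = -1e30f;
        for (int j = 0; j < n_kv; ++j)
        {
            float s = 0.f;
            for (int k = 0; k < d; ++k)
                s += Q[i * d + k] * K[j * d + k];
            s *= scale;
            if (mask) s += (*mask)[i * n_kv + j];
            score[j] = s;
            maxv = std::max(maxv, s);
        }
        // Numerically stable softmax over the key axis.
        float denom = 0.f;
        for (int j = 0; j < n_kv; ++j)
        {
            score[j] = std::exp(score[j] - maxv);
            denom += score[j];
        }
        // Weighted sum of the value rows.
        for (int j = 0; j < n_kv; ++j)
            for (int k = 0; k < d; ++k)
                out[i * d + k] += score[j] / denom * V[j * d + k];
    }
    return out;
}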
70 changes: 65 additions & 5 deletions fish-speech.cpp/src/llama.cpp
@@ -1,6 +1,7 @@
#include "llama.h"

LibTTSBegin
inline void UnUsedPtr(void*) {}

RMSNorm::RMSNorm(Module* _Parent, const std::wstring& _Name, SizeType dim, float eps) :
Module(_Parent, _Name),
@@ -136,21 +137,80 @@ ggml_tensor* Attention::operator()(
int Scale = n_head / n_local_heads;
if(Scale > 1)
{
K = ggml_repeat(_Ctx, K, ggml_new_tensor_4d(_Ctx, K->type, K->ne[0], K->ne[1], K->ne[2] * Scale, K->ne[3]));
V = ggml_repeat(_Ctx, V, ggml_new_tensor_4d(_Ctx, V->type, V->ne[0], V->ne[1], V->ne[2] * Scale, V->ne[3]));
K = ggml_repeat(_Ctx, K, ggml_new_tensor_4d(_Ctx, K->type, K->ne[0], K->ne[1] * Scale, K->ne[2], K->ne[3]));
V = ggml_repeat(_Ctx, V, ggml_new_tensor_4d(_Ctx, V->type, V->ne[0], V->ne[1] * Scale, V->ne[2], V->ne[3]));
}

//TODO
ggml_tensor* y;
if(use_sdpa)
y = scaled_dot_product_attention(
Q,
K,
V,
_Ctx,
mask,
training ? dropout : 0.f
);
else
y = eq_scaled_dot_product_attention(
Q,
K,
V,
_Ctx,
mask,
training ? dropout : 0.f
);

return nullptr;
y = ggml_reshape_3d(_Ctx, ggml_cont(_Ctx, ggml_permute(_Ctx, y, 0, 2, 1, 3)), dim, seqlen, bsz);

return wo(y, _Ctx, _Inplace);
}

ggml_tensor* Attention::apply_rotary_emb(ggml_tensor* x, ggml_tensor* freqs_cis, ggml_context* _Ctx, bool _Inplace)
{
//TODO
int ndim = int(x->ne[0]) / 2;
int64_t n_tokens = x->ne[0] / 2;
for (size_t i = 1; i < 4; ++i)
n_tokens *= x->ne[i];
auto xshape = ggml_reshape_2d(_Ctx, x, 2, n_tokens);
auto xshape0 = ggml_view_4d(_Ctx, xshape, ndim, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0);
auto xshape1 = ggml_view_4d(_Ctx, xshape, ndim, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], sizeof(float));

return nullptr;
}

ggml_tensor* Attention::scaled_dot_product_attention(
ggml_tensor* query,
ggml_tensor* key,
ggml_tensor* value,
ggml_context* _Ctx,
ggml_tensor* attn_mask,
float dropout_p,
bool _Inplace
)
{
UnUsedPtr(attn_mask);
UNUSED(dropout_p);
UNUSED(_Inplace);
return ggml_flash_attn(_Ctx, query, key, value, false);
}

ggml_tensor* Attention::eq_scaled_dot_product_attention(
ggml_tensor* query,
ggml_tensor* key,
ggml_tensor* value,
ggml_context* _Ctx,
ggml_tensor* attn_mask,
float dropout_p,
bool _Inplace
)
{
UnUsedPtr(attn_mask);
UNUSED(dropout_p);
UNUSED(_Inplace);
return ggml_flash_attn(_Ctx, query, key, value, false);
}

void Attention::DumpCurrentLayerInfo(std::wstring& _Tmp)
{
_Tmp += std::format(
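Note (not part of this commit): Scale = n_head / n_local_heads, and the ggml_repeat calls above tile K and V so their head count matches the query heads (grouped-query attention); the fix appears to move the Scale factor from ne[2] to ne[1]. As a layout-level illustration only, assuming a hypothetical contiguous [heads][tokens][head_dim] buffer, the intended repetition is:

#include <cstring>
#include <vector>

// Tile each of the n_local_heads KV heads `scale` times so the KV head count
// matches n_head; illustrative only, assumed [heads][tokens][head_dim] layout.
std::vector<float> repeat_kv_heads(
    const std::vector<float>& kv, int n_local_heads, int scale,
    int tokens, int head_dim)
{
    const size_t head_size = static_cast<size_t>(tokens) * head_dim;
    std::vector<float> out(head_size * n_local_heads * scale);
    for (int h = 0; h < n_local_heads; ++h)
        for (int r = 0; r < scale; ++r)
            std::memcpy(out.data() + (h * scale + r) * head_size,
                        kv.data() + h * head_size,
                        head_size * sizeof(float));
    return out;
}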
20 changes: 6 additions & 14 deletions fish-speech.cpp/test.py
@@ -1,15 +1,7 @@
import torch

a = torch.nn.ConvTranspose2d(114, 514, 3)
def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 20000):
freqs = 1.0 / (
base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
)
t = torch.arange(seq_len, device=freqs.device)
freqs = torch.outer(t, freqs)
freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
return cache.to(dtype=torch.bfloat16)

b = precompute_freqs_cis(2000, 4999)
print(b.size())
import time
for i in range(20):
a = torch.ones(size=(1, 768, 100000))
beg = time.time()
a.fill_(i)
print(time.time() - beg)
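Note (not part of this commit): the precompute_freqs_cis helper removed from test.py built the rotary cos/sin cache that the freqs_cis argument of apply_rotary_emb presumably expects. A direct C++ translation of the removed Python (illustrative only, returning float rather than bfloat16) could be:

#include <cmath>
#include <vector>

// Rotary frequency cache: cache[s][j][0] = cos(s * freq_j),
// cache[s][j][1] = sin(s * freq_j), with freq_j = base^(-2j / n_elem).
// Mirrors the removed Python precompute_freqs_cis (base = 20000 by default).
std::vector<float> precompute_freqs_cis(int seq_len, int n_elem, float base = 20000.f)
{
    const int half = n_elem / 2;
    std::vector<float> cache(static_cast<size_t>(seq_len) * half * 2);
    for (int j = 0; j < half; ++j)
    {
        const float freq = 1.f / std::pow(base, 2.f * j / n_elem);
        for (int s = 0; s < seq_len; ++s)
        {
            cache[(static_cast<size_t>(s) * half + j) * 2 + 0] = std::cos(s * freq);
            cache[(static_cast<size_t>(s) * half + j) * 2 + 1] = std::sin(s * freq);
        }
    }
    return cache;
}

With the old test's arguments, precompute_freqs_cis(2000, 4999) yields a 2000 x 2499 x 2 cache, matching the size the deleted print(b.size()) reported.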