Commit: finish demo
wuhaozhe committed Nov 2, 2021
1 parent 822393a commit 54d45b4
Showing 6 changed files with 254 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
**/*.pyc
deepspeech/deepspeech-0.9.2-checkpoint/
**/*.wav
!example/example.wav
**/*.zip
**/*.m4a
**/*.mp4
4 changes: 4 additions & 0 deletions README.md
@@ -22,10 +22,12 @@ conda activate python36
- Download the Basel Face Model. Due to the license agreement of the Basel Face Model, you have to download the BFM09 model after submitting an application on its [home page](https://faces.dmi.unibas.ch/bfm/main.php?nav=1-2&id=downloads). After getting access to the BFM data, download "01_MorphableModel.mat" and put it into the ./deep_3drecon/BFM subfolder.
- Download the Expression Basis provided by [Guo et al](https://github.com/Juyong/3DFace). You can find a link named "CoarseData" in the first row of the Introduction section of their repository. Download and unzip Coarse_Dataset.zip, then put "Exp_Pca.bin" into the ./deep_3drecon/BFM subfolder. The expression basis is constructed from [Facewarehouse](http://kunzhou.net/zjugaps/facewarehouse/) data and transferred to the BFM topology.
- Download the pre-trained [reconstruction network](https://drive.google.com/file/d/176LCdUDxAj7T2awQ5knPMPawq5Q2RUWM/view), unzip it, and put "FaceReconModel.pb" into the ./deep_3drecon/network subfolder.
- Run `git lfs checkout ./deep_3drecon/BFM/BFM_model_front.mat`
- Download the pretrained [audio2motion model](https://cloud.tsinghua.edu.cn/f/acb6d482a26e4eb8b116/?dl=1) and put it into `./audio2motion/model`
- Download the pretrained [texture encoder](https://cloud.tsinghua.edu.cn/f/c60a3466016948c48951/?dl=1) and [render](https://cloud.tsinghua.edu.cn/f/106023055772444f8f15/?dl=1) models, and put them into `./render/model`

#### Run
To run our demo, you need at least one GPU with 11 GB of memory.
```
python demo.py --in_img [*.png] --in_audio [*.wav] --output_path [path]
```
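For example, with the bundled sample assets (the defaults in `demo.py`):
```
python demo.py --in_img example/example.png --in_audio example/example.wav --output_path output
```
The output directory must be empty (the demo refuses to run otherwise), and after the run it contains ten result videos, `result_0.mp4` through `result_9.mp4`, one per style code.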
@@ -53,6 +55,8 @@ def get_style_code(exp, pose):
return np.concatenate((exp_std, diff_exp_std, diff_pose_std))
```

Note that the pose of the talking face is static in the current demo. You can control the pose by modifying the `coeff_array` at line 93 of `demo.py`; a sketch follows below. The `coeff_array` has shape $N \times 257$, where $N$ is the number of frames, and the 257-dimensional vector has the same definition as in [deep 3d face reconstruction](https://github.com/microsoft/Deep3DFaceReconstruction): dims $254$-$257$ control the translation and dims $224$-$227$ control the Euler angles of the pose.
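As a minimal sketch (a hypothetical helper, not part of the repository), you could sweep one of the Euler-angle dims across frames before the coefficients are rendered; which of the three angle dims corresponds to yaw depends on the convention, so adjust the index as needed:
```
import numpy as np

def add_yaw_sweep(coeff_array, max_angle = 0.2):
    # coeff_array: (N, 257); dims 224-227 hold the Euler angles and
    # dims 254-257 the translation (hypothetical edit, for illustration only).
    n = len(coeff_array)
    # Sweep the (assumed) yaw angle linearly from -max_angle to +max_angle radians.
    coeff_array[:, 225] = np.linspace(-max_angle, max_angle, n)
    return coeff_array
```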

------

### Project Overview
249 changes: 249 additions & 0 deletions demo.py
@@ -0,0 +1,249 @@
import deep_3drecon
import face_alignment
import cv2
import os
import numpy as np
import librosa
import torch
import math
import audio2motion
import render
import torch.nn as nn
import torchvision
import argparse
from align_img import align_lm68, align

parser = argparse.ArgumentParser()
parser.add_argument("--in_img", help = "input portrait", default='example/example.png', type=str)
parser.add_argument("--in_audio", help="input audio", default='example/example.wav', type=str)
parser.add_argument("--output_path", help="output path of videos", default='output', type=str)
conf = parser.parse_args()

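# Map 68-point facial landmarks to the 5-point layout (left eye, right eye,
# nose tip, left/right mouth corner) expected by the face reconstructor.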
def lm68_2_lm5(in_lm):
lm_idx = np.array([31,37,40,43,46,49,55]) - 1
lm = np.stack([in_lm[lm_idx[0],:],np.mean(in_lm[lm_idx[[1,2]],:],0),np.mean(in_lm[lm_idx[[3,4]],:],0),in_lm[lm_idx[5],:],in_lm[lm_idx[6],:]], axis = 0)
lm = lm[[1,2,0,3,4],:2]
return lm

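# Unwrap the aligned portrait into a 256x256 texture: every image pixel is
# scattered to the texel addressed by the rendered UV map, and texels hit by
# several pixels are averaged.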
def recon_texture(uv_file, img_file, out_path):
uv_img = cv2.imread(uv_file).astype(np.int32)
img = cv2.imread(img_file).astype(np.int32)

x = uv_img[:, :, 0].reshape(-1)
y = uv_img[:, :, 1].reshape(-1)
index = y * 256 + x
img = img.reshape(-1, 3)
texture = np.zeros((256 * 256, 3), dtype = np.int32)
texture_count = np.zeros((256 * 256), dtype = np.int32)

np.add.at(texture_count, index, 1)
np.add.at(texture, index, img)
texture_count[texture_count == 0] = 1
texture = texture / np.expand_dims(texture_count, 1)

texture = texture.reshape(256, 256, 3)
cv2.imwrite(out_path, texture)


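# Read every frame of a video into an (N, H, W, 3) BGR uint8 array.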
def read_video(video_path):
cap = cv2.VideoCapture(video_path)
frame_list = []
while cap.isOpened():
ret, frame = cap.read()
        if not ret:
break
frame_list.append(frame)
return np.array(frame_list)

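# Stage 1: detect facial landmarks, fit the 3DMM coefficients of the input
# portrait, and recover its UV texture map.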
os.environ["CUDA_VISIBLE_DEVICES"]=str(0)
img_path = conf.in_img
audio_path = conf.in_audio
tmp_path = conf.output_path

os.makedirs(tmp_path, exist_ok = True)
if len(os.listdir(tmp_path)) != 0:
    raise Exception("Output path must be empty")

fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, network_size=4, device='cuda')
face_reconstructor = deep_3drecon.Reconstructor()
im_bgr = cv2.imread(img_path)
im_rgb = im_bgr[:, :, ::-1]
lm68 = fa.get_landmarks(im_rgb)[0]
lm5 = lm68_2_lm5(lm68)
coeff, align_img = face_reconstructor.recon_coeff(np.array([im_bgr]), np.array([lm5]), return_image = True)
# Due to a bug, we use tex_noise.png in our implementation; tex_no_noise.mp4 is currently unused and will be fixed later.
face_reconstructor.recon_texture_from_coeff(coeff, align_img, os.path.join(tmp_path, "tex_no_noise.mp4"), tmp_dir = tmp_path)
face_reconstructor.recon_uv_from_coeff(coeff, os.path.join(tmp_path, "uv_src.mp4"), tmp_dir = tmp_path)

align_img = align_img[0]
cv2.imwrite(os.path.join(tmp_path, "align.png"), align_img)
uv_img = read_video(os.path.join(tmp_path, "uv_src.mp4"))[0]
recon_texture(os.path.join(tmp_path, "0.png"), os.path.join(tmp_path, "align.png"), os.path.join(tmp_path, "tex_noise.png"))
tex_img = cv2.imread(os.path.join(tmp_path, "tex_noise.png")).astype(np.int32)

del fa
torch.cuda.empty_cache()

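# Stage 2: extract per-frame audio features. DeepSpeech probabilities and RMS
# energy are computed at 50 fps while the video runs at 25 fps, hence the
# // 2 when deriving the number of output frames.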
import deepspeech
deepspeech_prob = deepspeech.get_prob(audio_path)
del deepspeech
audio, sr = librosa.load(audio_path, sr = 16000)
audio_energy = librosa.feature.rms(y = audio, frame_length = 512, hop_length = 320, center = False)
audio_energy = np.transpose(audio_energy)
coeff_len = len(deepspeech_prob) // 2
coeff_array = np.tile(coeff, (coeff_len, 1))

y_len = len(coeff_array)
audio_clip_list = []
energy_clip_list = []

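# Crop the 50 fps audio-feature track to the frame window [y_left, y_right)
# and pad it, by repeating the edge frames, to a fixed length of 80 steps.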
def get_sync_data(x, x_dim, y_left, y_right):
x_len = len(x)
x_left = math.floor(y_left * 50 / 25)
x_right = math.floor(y_right * 50 / 25)
pad_len = 80 - x_right + x_left

if pad_len % 2 == 0:
pad_left = pad_len // 2
pad_right = pad_len // 2
else:
pad_left = pad_len // 2
pad_right = pad_len - pad_left

x_left = x_left - pad_left
x_right = x_right + pad_right
if x_left < 0:
if x_right > x_len:
x_data = np.concatenate((np.tile(x[0], -1 * x_left).reshape(-1, x_dim), x, np.tile(x[-1], x_right - x_len).reshape(-1, x_dim)), axis = 0)
else:
x_data = x[0: x_right]
x_data = np.concatenate((np.tile(x[0], -1 * x_left).reshape(-1, x_dim), x_data), axis = 0)
elif x_right > x_len:
x_data = x[x_left: x_len]
x_data = np.concatenate((x_data, np.tile(x[-1], x_right - x_len).reshape(-1, x_dim)), axis = 0)
else:
x_data = x[x_left: x_right]

return x_data

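# Slide a 32-frame window with stride 8 over the sequence; windows near the
# end are shifted back so that every clip contains exactly 32 frames.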
for i in range(0, y_len, 8):
if i > y_len - 32:
y_left = y_len - 32
y_right = y_len
else:
y_left = i
y_right = i + 32

audio_clip = get_sync_data(deepspeech_prob, 29, y_left, y_right)
energy_clip = get_sync_data(audio_energy, 1, y_left, y_right)

audio_clip_list.append(audio_clip)
energy_clip_list.append(energy_clip)

audio_clip_list = np.array(audio_clip_list)
energy_clip_list = np.array(energy_clip_list)

audio_batch = torch.from_numpy(audio_clip_list).transpose(1, 2).float().cuda()
energy_batch = torch.from_numpy(energy_clip_list).transpose(1, 2).float().cuda()
sty_batch = torch.from_numpy(np.load("./example/sty.npy")).float().cuda()

torch.cuda.empty_cache()

# os.environ["CUDA_VISIBLE_DEVICES"]=str(1)
model = audio2motion.StyleFusionModel().cuda()
model.load_state_dict(torch.load("./audio2motion/model/backbone.pkl"), strict = True)
model.eval()

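# For each of the 10 style codes, predict per-frame expression coefficients,
# average the overlapping windows, write the result into dims 80-144 of the
# coefficients, and re-render the UV and background videos.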
with torch.no_grad():
for idx in range(10):
sty_tmp = sty_batch[idx].unsqueeze(0).repeat(len(audio_batch), 1)
pred_exp_batch = model(audio_batch, energy_batch, sty_tmp)

y_len = len(coeff_array)
y_repeat = torch.zeros(y_len).int().cuda()
predict_exp_cat = torch.zeros((y_len, 64)).float().cuda()
for counter, i in enumerate(range(0, y_len, 8)):
if i > y_len - 32:
y_left = y_len - 32
y_right = y_len
else:
y_left = i
y_right = i + 32
y_repeat[y_left: y_right] += 1
predict_exp_cat[y_left: y_right] += pred_exp_batch[counter].transpose(0, 1)
y_repeat = y_repeat.float()
predict_exp_cat = predict_exp_cat / y_repeat.unsqueeze(1)
coeff_array[:, 80:144] = predict_exp_cat.detach().cpu().numpy()
np.save(os.path.join(tmp_path, "test_{}.npy".format(idx)), coeff_array)
face_reconstructor.recon_uv_from_coeff(coeff_array,
out_path = os.path.join(tmp_path, "test_uv_{}.mp4".format(idx)),
bg_path = os.path.join(tmp_path, "test_bg_{}.mp4".format(idx)),
tmp_dir = tmp_path
)

del model
del face_reconstructor
del audio_batch
del energy_batch
del sty_batch
del coeff_array
del predict_exp_cat
torch.cuda.empty_cache()

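# Stage 3: neural rendering. Encode the recovered texture, sample it through
# the predicted UV maps, and translate the sampled features into RGB frames.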
tex_encoder = render.UNet(9, 16).cuda()
tex_sampler = render.TexSampler().cuda()
face_unet = render.define_G(16, 3, 64, 'local').cuda()
tex_encoder.load_state_dict(torch.load("./render/model/tex_encoder.pkl"), strict = True)
face_unet.load_state_dict(torch.load("./render/model/face_unet.pkl"), strict = True)
tex_encoder.eval()
tex_sampler.eval()
face_unet.eval()
tex_img = torch.from_numpy(tex_img).float().permute(2, 0, 1) / 128 - 1
tex_img_batch = tex_img.unsqueeze(0).repeat(3, 1, 1, 1).cuda()

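# Render each style's sequence in mini-batches of 8 frames, mask out the
# background, dump the frames as PNGs, and mux them with the input audio
# via ffmpeg.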
with torch.no_grad():
batch_size = 8
for idx in range(10):
uv_path = os.path.join(tmp_path, "test_uv_{}.mp4".format(idx))
bg_path = os.path.join(tmp_path, "test_bg_{}.mp4".format(idx))
bg_frames = np.array(read_video(bg_path))
uv_frames = np.array(read_video(uv_path))
uv_frames = torch.from_numpy(uv_frames).float().permute(0, 3, 1, 2) / 255
uv_img_batch = uv_frames[:, :2]
bg_img_batch = torch.from_numpy(bg_frames > 127).float()[:, :, :, 0].unsqueeze(1)
bg_img_batch, uv_img_batch, tex_img_batch = bg_img_batch.unsqueeze(0), uv_img_batch.unsqueeze(0), tex_img_batch.unsqueeze(0)
tex_img_batch = tex_img_batch.reshape(tex_img_batch.shape[0], -1, tex_img_batch.shape[3], tex_img_batch.shape[4]).cuda()
tex = tex_encoder(tex_img_batch)
pred_img_batch = torch.zeros((uv_img_batch.shape[0], uv_img_batch.shape[1], 3, uv_img_batch.shape[3], uv_img_batch.shape[4])).float()
start_idx = 0
while start_idx < pred_img_batch.shape[1]:
if start_idx + batch_size > pred_img_batch.shape[1]:
end_idx = pred_img_batch.shape[1]
else:
end_idx = start_idx + batch_size
bg_tmp_batch = bg_img_batch[:, start_idx: end_idx].cuda()
uv_tmp_batch = uv_img_batch[:, start_idx: end_idx].cuda()
bg_tmp_batch = bg_tmp_batch.reshape(-1, bg_tmp_batch.shape[2], bg_tmp_batch.shape[3], bg_tmp_batch.shape[4])
uv_tmp_batch = uv_tmp_batch.reshape(-1, uv_tmp_batch.shape[2], uv_tmp_batch.shape[3], uv_tmp_batch.shape[4])
tex_tmp = tex.unsqueeze(1).repeat(1, uv_tmp_batch.shape[0], 1, 1, 1)
tex_tmp = tex_tmp.reshape(-1, tex_tmp.shape[2], tex_tmp.shape[3], tex_tmp.shape[4])
sample_image = tex_sampler(uv_tmp_batch, tex_tmp)
pred_image = face_unet(sample_image) * (1 - bg_tmp_batch)
pred_img_batch[:, start_idx:end_idx] = pred_image.cpu()
start_idx += batch_size
pred_img_batch = pred_img_batch[0].cpu().detach()
pred_img_batch = torch.flip(pred_img_batch, dims = [1])
os.system("rm {}".format(os.path.join(tmp_path, "*.png")))
for i in range(len(pred_img_batch)):
torchvision.utils.save_image(pred_img_batch[i], "./{}/{}.png".format(tmp_path, i), normalize = True, range = (-1, 1))
os.system("ffmpeg -y -loglevel warning -framerate 25 -start_number 0 -i {}/%d.png -c:v libx264 -pix_fmt yuv420p -b:v 2000k {}/render_{}.mp4".format(tmp_path, tmp_path, idx))
os.system("ffmpeg -y -loglevel warning -i {} -i {} -map 0:v -map 1:a -c:v copy -shortest {}".format(
"{}/render_{}.mp4".format(tmp_path, idx),
audio_path,
"{}/result_{}.mp4".format(tmp_path, idx),
))

os.system("rm {}".format(os.path.join(tmp_path, "*.png")))
os.system("rm {}".format(os.path.join(tmp_path, "test_*")))
os.system("rm {}".format(os.path.join(tmp_path, "render_*")))
os.system("rm {}".format(os.path.join(tmp_path, "tex_no_noise.mp4")))
os.system("rm {}".format(os.path.join(tmp_path, "uv_src.mp4")))
Binary file added example/example.png
Binary file not shown.
Binary file added example/example.wav
Binary file not shown.
Binary file added example/sty.npy
Binary file not shown.
