From 7c86482c81766970f7ddb470b9abbb685308f807 Mon Sep 17 00:00:00 2001 From: TutorialsGHG <65071223+TutorialsGHG@users.noreply.github.com> Date: Sun, 10 May 2026 21:05:24 +0200 Subject: [PATCH] new version --- config.json | 6 +++--- face_swap.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/config.json b/config.json index e2ae946..2034257 100644 --- a/config.json +++ b/config.json @@ -4,11 +4,11 @@ "output_dir": "", "video_input_dir": "C:/Users/timoh/Desktop/faceswap/Eingabe", "video_output_dir": "C:\\Users\\timoh\\Desktop\\faceswap\\Eingabe\\output_videos", - "voice_ref": "C:/Users/timoh/Desktop/Swap/hay.mp3", - "voice_source_audio": "", + "voice_ref": "C:/Users/timoh/Desktop/faceswap/hay.mp3", + "voice_source_audio": "C:/Users/timoh/Desktop/faceswap/The_Little_One_Youve_Always_Been_By_PowerfulListen.wav", "voice_output": "C:/Users/timoh/Desktop/faceswap/Ausgabe/hay_cloned.wav", "voice_language": "en", - "voice_mode": "text", + "voice_mode": "audio", "enhance": true, "color": true, "voice_text": "Obsession. It's a fun word, a word that makes your body shake, it makes you feel a little bit strange, but good after all. That obsession comes from me; you are obsessed, addicted, owned, and you love it." 
# Method of ``VoiceCloner`` in face_swap.py (post-patch version of this hunk).
def clone_from_audio(self, speaker_wav, source_wav, out_file, chunk_sec=30):
    """Convert the voice in *source_wav* to the voice of *speaker_wav*.

    Sources that fit into a single segment are converted with one call.
    Longer sources are cut into segments of ``chunk_sec`` seconds, each
    segment is converted on its own, and the converted segments are
    concatenated into *out_file*.

    Args:
        speaker_wav: Reference recording of the target voice.
        source_wav: Recording whose speech is to be converted.
        out_file: Destination path; missing parent directories are created.
        chunk_sec: Segment length in seconds. Use a smaller value (e.g. 20)
            if the conversion runs out of memory.

    Returns:
        The output path as a string.

    Raises:
        ValueError: If ``chunk_sec`` is not positive.
        RuntimeError: If ``soundfile`` (or ``numpy``) cannot be imported.
    """
    if chunk_sec <= 0:
        raise ValueError(f"chunk_sec must be positive, got {chunk_sec!r}")

    speaker_wav = self._check_audio(speaker_wav, "Referenz-Stimme")
    source_wav = self._check_audio(source_wav, "Eingabe-Audio")
    self._ensure_vc()

    import tempfile
    try:
        import soundfile as sf
        import numpy as np
    except ImportError as err:
        # Chain the cause so the traceback shows which import really failed.
        raise RuntimeError(
            "Bitte installiere soundfile:\n"
            " python -m pip install soundfile"
        ) from err

    def _resample_linear(samples, src_sr, dst_sr):
        """Linearly resample *samples* (1-D, or 2-D frames x channels)."""
        if src_sr == dst_sr or len(samples) == 0:
            return samples
        n_out = max(1, int(round(len(samples) * dst_sr / src_sr)))
        src_t = np.arange(len(samples)) / src_sr
        dst_t = np.arange(n_out) / dst_sr
        if samples.ndim == 1:
            return np.interp(dst_t, src_t, samples)
        return np.stack(
            [np.interp(dst_t, src_t, samples[:, ch])
             for ch in range(samples.shape[1])],
            axis=1,
        )

    out_path = Path(out_file)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Load the source and work out the segment boundaries (in frames).
    data, sr = sf.read(source_wav, always_2d=False)
    total_samples = len(data)
    chunk_samples = max(1, int(chunk_sec * sr))
    starts = list(range(0, total_samples, chunk_samples))

    # Fold a very short remainder into the previous segment instead of
    # converting it alone. NOTE(review): assumes the converter handles
    # sub-second inputs poorly — confirm against the model in use.
    min_tail_sec = 1.0
    if len(starts) > 1 and total_samples - starts[-1] < int(min_tail_sec * sr):
        starts.pop()
    bounds = list(zip(starts, starts[1:] + [total_samples]))
    num_chunks = len(bounds)

    if num_chunks <= 1:
        # Short (or empty) file: convert it directly, no temp files needed.
        self.vc.voice_conversion_to_file(
            source_wav=source_wav, target_wav=speaker_wav, file_path=str(out_path)
        )
        return str(out_path)

    self.log(f"VOICE: Datei zu lang — teile in {num_chunks} Segmente à {chunk_sec}s ...")
    converted = []
    with tempfile.TemporaryDirectory() as tmpdir:
        for i, (start, end) in enumerate(bounds):
            chunk_in = Path(tmpdir) / f"chunk_{i:04d}_in.wav"
            chunk_out = Path(tmpdir) / f"chunk_{i:04d}_out.wav"
            sf.write(str(chunk_in), data[start:end], sr)
            self.log(f"VOICE: Segment {i+1}/{num_chunks} ...")
            self.vc.voice_conversion_to_file(
                source_wav=str(chunk_in), target_wav=speaker_wav,
                file_path=str(chunk_out)
            )
            # Read the result back before the temp directory is removed.
            converted.append(sf.read(str(chunk_out)))

    # Merge the segments at the sample rate of the first converted segment;
    # any segment that came back at a different rate is resampled first.
    self.log("VOICE: Füge Segmente zusammen ...")
    target_sr = converted[0][1]
    merged = np.concatenate(
        [_resample_linear(samples, rate, target_sr) for samples, rate in converted],
        axis=0,
    )
    sf.write(str(out_path), merged, target_sr)
    return str(out_path)