new version

2026-05-10 21:05:24 +02:00
parent 01110c0922
commit 7c86482c81
2 changed files with 57 additions and 5 deletions
--- a/config.json
+++ b/config.json
@@ -4,11 +4,11 @@
  "output_dir": "",
  "video_input_dir": "C:/Users/timoh/Desktop/faceswap/Eingabe",
  "video_output_dir": "C:\\Users\\timoh\\Desktop\\faceswap\\Eingabe\\output_videos",
-  "voice_ref": "C:/Users/timoh/Desktop/Swap/hay.mp3",
-  "voice_source_audio": "",
+  "voice_ref": "C:/Users/timoh/Desktop/faceswap/hay.mp3",
+  "voice_source_audio": "C:/Users/timoh/Desktop/faceswap/The_Little_One_Youve_Always_Been_By_PowerfulListen.wav",
  "voice_output": "C:/Users/timoh/Desktop/faceswap/Ausgabe/hay_cloned.wav",
  "voice_language": "en",
-  "voice_mode": "text",
+  "voice_mode": "audio",
  "enhance": true,
  "color": true,
  "voice_text": "Obsession. It's a fun word, a word that makes your body shake, it makes you feel a little bit strange, but good after all. That obsession comes from me; you are obsessed, addicted, owned, and you love it."
--- a/face_swap.py
+++ b/face_swap.py
@@ -1188,11 +1188,63 @@ class VoiceCloner:

    def clone_from_audio(self, speaker_wav, source_wav, out_file):
+        """Convert the voice in ``source_wav`` to the voice of ``speaker_wav``.
+
+        A source longer than one segment is split into fixed-length pieces
+        that are converted one at a time and then concatenated, so the
+        converter never has to process the whole recording in one call.
+
+        Args:
+            speaker_wav: Path to the reference voice recording.
+            source_wav: Path to the audio whose voice is to be replaced.
+            out_file: Destination path; missing parent directories are created.
+
+        Returns:
+            The output path as a string.
+
+        Raises:
+            RuntimeError: If soundfile/numpy cannot be imported, or if the
+                converted segments do not all share one sample rate.
+        """
        speaker_wav = self._check_audio(speaker_wav, "Referenz-Stimme")
        source_wav = self._check_audio(source_wav, "Eingabe-Audio")
        self._ensure_vc()
+
+        import math
+        import tempfile
+        try:
+            import numpy as np
+            import soundfile as sf
+        except ImportError as exc:
+            # Chain the cause so the traceback shows which module is missing.
+            raise RuntimeError(
+                "Bitte installiere soundfile:\n"
+                "  python -m pip install soundfile"
+            ) from exc
+
        out_path = Path(out_file)
        out_path.parent.mkdir(parents=True, exist_ok=True)
-        self.vc.voice_conversion_to_file(source_wav=source_wav, target_wav=speaker_wav, file_path=str(out_path))
+
+        # Load the source audio and work out how many segments it needs.
+        CHUNK_SEC = 30  # segment length in seconds; lower it (e.g. 20) if RAM runs short
+        data, sr = sf.read(source_wav, always_2d=False)
+        chunk_samples = int(CHUNK_SEC * sr)
+        total_samples = len(data)
+        num_chunks = math.ceil(total_samples / chunk_samples)
+
+        if num_chunks <= 1:
+            # Short file: convert it in a single pass.
+            self.vc.voice_conversion_to_file(
+                source_wav=source_wav, target_wav=speaker_wav, file_path=str(out_path)
+            )
+            return str(out_path)
+
+        self.log(f"VOICE: Datei zu lang — teile in {num_chunks} Segmente à {CHUNK_SEC}s ...")
+        segments = []
+        target_sr = None
+        with tempfile.TemporaryDirectory() as tmpdir:
+            for i in range(num_chunks):
+                start = i * chunk_samples
+                # Slicing past the end is safe; the last segment is just shorter.
+                chunk = data[start:start + chunk_samples]
+                chunk_in = Path(tmpdir) / f"chunk_{i:04d}_in.wav"
+                chunk_out = Path(tmpdir) / f"chunk_{i:04d}_out.wav"
+                sf.write(str(chunk_in), chunk, sr)
+                self.log(f"VOICE: Segment {i+1}/{num_chunks} ...")
+                self.vc.voice_conversion_to_file(
+                    source_wav=str(chunk_in), target_wav=speaker_wav,
+                    file_path=str(chunk_out)
+                )
+                out_data, out_sr = sf.read(str(chunk_out))
+                if target_sr is None:
+                    target_sr = out_sr
+                elif out_sr != target_sr:
+                    # Concatenating mixed rates would play some segments at
+                    # the wrong speed, so fail loudly instead of merging.
+                    raise RuntimeError(
+                        f"VOICE: Segment {i+1} hat Abtastrate {out_sr} Hz "
+                        f"statt {target_sr} Hz — Zusammenführen nicht möglich."
+                    )
+                segments.append(out_data)
+
+        # Merge the converted segments; they are already in memory, so the
+        # temporary directory is no longer needed at this point.
+        self.log("VOICE: Füge Segmente zusammen ...")
+        sf.write(str(out_path), np.concatenate(segments, axis=0), target_sr)
+
        return str(out_path)