Rewrite filelists.py to parse DLG files

This is required for TSL support.
2021-04-14 22:30:46 +07:00 · 2021-04-14 22:30:46 +07:00 · 5a534bfcfa
commit 5a534bfcfa
parent 0b226fce6e
1 changed file with 63 additions and 14 deletions
--- a/scripts/filelists.py
+++ b/scripts/filelists.py
@@ -17,31 +17,80 @@ if not os.path.exists(wav_dir):
    raise RuntimeError("WAV directory does not exist")


-def get_lines_from_tlk(obj, speaker):
+def is_suitable_text(text):
+    """Return False when *text* both starts with "[" and ends with "]".
+
+    Any other text, including the empty string, is considered suitable.
+    """
+    wrapped_in_brackets = text.startswith("[") and text.endswith("]")
+    return not wrapped_in_brackets
+
+
+def index_or_negative_one(string, substr, beg=0):
+    """Return the lowest index of *substr* in *string* from *beg* on, or -1.
+
+    ``str.find`` already reports a missing substring as -1, so there is no
+    need to call ``str.index`` and translate its ``ValueError``.
+    """
+    return string.find(substr, beg)
+
+
+def erase_brackets(string, left, right):
+    """Remove the first *left* ... *right* span from *string*.
+
+    The span runs from the first occurrence of *left* up to and including
+    the first occurrence of *right* found after it.
+
+    Returns a ``(text, erased)`` tuple.  When no complete span exists the
+    text is returned unchanged and ``erased`` is False.
+    """
+    opening = string.find(left)
+    # Only search for the closer past the opener so the pair is ordered.
+    closing = string.find(right, opening + 1) if opening != -1 else -1
+    if closing == -1:
+        return (string, False)
+
+    remainder = string[:opening] + string[(closing + 1):]
+    return (remainder, True)
+
+
+def clear_text(text):
+    """Strip every "[...]" and "{...}" span from *text* and trim the result.
+
+    Spans are removed one at a time with ``erase_brackets`` until a pass
+    removes nothing of either kind.  Returns the remaining text with
+    leading and trailing whitespace stripped.
+    """
+    while True:
+        text, found_square = erase_brackets(text, "[", "]")
+        text, found_curly = erase_brackets(text, "{", "}")
+        # Keep looping while either kind was erased: stopping as soon as no
+        # "{...}" span is found would leave later "[...]" spans in place.
+        if not (found_square or found_curly):
+            break
+
+    return text.strip()
+
+
+def get_lines_from_dlg(obj, speaker, tlk_strings):
+    """Collect filelist lines for the voiced entries of a parsed DLG object.
+
+    Args:
+        obj: Parsed DLG JSON; entries are read from ``obj["EntryList|15"]``.
+        speaker: Currently unused.  NOTE(review): the removed
+            ``get_lines_from_tlk`` filtered on this value -- confirm whether
+            DLG entries should be filtered by speaker as well.
+        tlk_strings: Mapping from StrRef to a tuple whose second item is
+            the line text.
+
+    Returns:
+        A list of newline-terminated ``<wav path>|<text>|0`` strings, one
+        per distinct ``VO_ResRef|11`` value whose WAV file exists under
+        ``wav_dir``.  The text is passed through ``clear_text``.
+    """
    lines = []
-    if "strings" in obj:
+    if "EntryList|15" in obj:
        uniq_sound = set()
-        for string in obj["strings"]:
-            if ("soundResRef" in string) and (speaker in string["soundResRef"]):
-                soundresref = string["soundResRef"].lower()
-                text = string["text"]
-                if soundresref.startswith("n") and (not (text.startswith("[") and text.endswith("]"))) and (not soundresref in uniq_sound):
-                    wav_filename = os.path.join(wav_dir, soundresref + ".wav")
-                    if os.path.exists(wav_filename):
-                        lines.append("{}|{}|0\n".format(wav_filename, text))
-                        uniq_sound.add(soundresref)
+        for entry in obj["EntryList|15"]:
+            if "VO_ResRef|11" in entry:
+                voresref = entry["VO_ResRef|11"].lower()
+                textstrref = int(entry["Text|12"].split("|")[0])
+                # A StrRef that is absent from the TLK table (for example
+                # when the table is empty) skips the entry instead of
+                # raising KeyError and aborting the whole run.
+                tlk_entry = tlk_strings.get(textstrref)
+                if textstrref != -1 and tlk_entry is not None:
+                    text = tlk_entry[1]
+                    if voresref and (voresref not in uniq_sound) and is_suitable_text(text):
+                        wav_filename = os.path.join(wav_dir, voresref + ".wav")
+                        if os.path.exists(wav_filename):
+                            lines.append("{}|{}|0\n".format(wav_filename, clear_text(text)))
+                            uniq_sound.add(voresref)

    return lines


 def generate_filelist(extract_dir, speaker):
-    # Extract lines from all TLK files
+    tlk_strings = dict()
+
+    # Read strings from dialog.tlk into dictionary
+    tlk_path = os.path.join(extract_dir, "dialog.tlk.json")
+    if os.path.exists(tlk_path):
+        with open(tlk_path, "r") as fp:
+            obj = json.load(fp)
+            if "strings" in obj:
+                for string in obj["strings"]:
+                    strref = int(string["_index"])
+                    soundresref = string["soundResRef"].lower()
+                    text = string["text"]
+                    tlk_strings[strref] = (soundresref, text)
+
+    # Extract lines from DLG files
    lines = []
    for f in glob.glob("{}/**".format(extract_dir), recursive=True):
-        if f.endswith(".tlk.json"):
+        if f.endswith(".dlg.json"):
            with open(f, "r") as fp:
                obj = json.load(fp)
-                lines.extend(get_lines_from_tlk(obj, speaker))
+                lines.extend(get_lines_from_dlg(obj, speaker, tlk_strings))

    # Split lines into training and validation filelists
    random.shuffle(lines)