Browse Source

add vot scripts and unicode textgrids

Josh Bicking 1 year ago
parent
commit
33b78519f9

+ 5 - 0
README.md

@@ -0,0 +1,5 @@
+This repository contains assorted data from our paper, *Vowel Comparison Between Native English and Russian English Speakers*.
+
+Voice onset time (VOT) analysis and English TextGrid annotation were done by Josh Bicking.
+
+Vowel length, formant extraction, and Russian TextGrid annotation were done by Tom Thomas.

BIN
textgrids/english1.TextGrid


BIN
textgrids/english10.TextGrid


BIN
textgrids/english11.TextGrid


BIN
textgrids/english12.TextGrid


BIN
textgrids/english13.TextGrid


BIN
textgrids/english14.TextGrid


BIN
textgrids/english15.TextGrid


BIN
textgrids/english2.TextGrid


BIN
textgrids/english3.TextGrid


BIN
textgrids/english4.TextGrid


BIN
textgrids/english5.TextGrid


BIN
textgrids/english6.TextGrid


BIN
textgrids/english7.TextGrid


BIN
textgrids/english8.TextGrid


BIN
textgrids/english9.TextGrid


BIN
textgrids/russian1.TextGrid


BIN
textgrids/russian10.TextGrid


BIN
textgrids/russian11.TextGrid


BIN
textgrids/russian12.TextGrid


BIN
textgrids/russian13.TextGrid


BIN
textgrids/russian14.TextGrid


BIN
textgrids/russian15.TextGrid


BIN
textgrids/russian2.TextGrid


BIN
textgrids/russian3.TextGrid


BIN
textgrids/russian4.TextGrid


BIN
textgrids/russian5.TextGrid


BIN
textgrids/russian6.TextGrid


BIN
textgrids/russian7.TextGrid


BIN
textgrids/russian8.TextGrid


BIN
textgrids/russian9.TextGrid


+ 99 - 0
vot-palatalization.py

@@ -0,0 +1,99 @@
+# Author: Josh Bicking
+
+from praatio import tgio
+import sys
+from os.path import join
+import numpy
+import subprocess
+
+if len(sys.argv) != 4:
+    print("Usage: {} FOLDER LANG TIER-NAME".format(sys.argv[0]))
+    print("FOLDER contains the 15 annotated voice samples.")
+    print("LANG is the language (the prefix of the sound/TextGrid files, english or russian).")
+    print("TIER-NAME is the name of the ipa tier. Either 'ipa' or 'IPA-phones'.")
+    exit(1)
+
+folder = sys.argv[1]
+langname = sys.argv[2]
+tiername = sys.argv[3]
+
+# list of vowels
+vowels = ["i","y","ɨ","ʉ","ɯ","u","ɪ","ʏ","ɪ̈","ʊ̈",
+          "ʊ","e","ø","ɘ","ɵ","ɤ","o","e̞","ø̞",
+          "ə","ɤ̞","o̞","ɛ","œ","ɜ","ɞ","ʌ","ɔ",
+          "æ","ɐ","a","ɶ","ä","ɑ","ɒ"]
+
+# List of all VOTs discovered
+vot = {}
+
+# Open all Files
+lang = []
+russian = []
+
+for i in range(1,16):
+    lang.append(tgio.openTextgrid(join(folder, "{}{}.TextGrid".format(langname, i))).tierDict[tiername].entryList)
+
+
+# Find all stop consonants with a vowel after them
+stop_consonants = ["t", "d", "k", "g", "p", "b", "ʔ"]
+
+for i in range(0,15):
+    for j in range(0, len(lang[i])):
+        if (lang[i][j].label != ""                      # Not silence
+            and lang[i][j].label[0] in stop_consonants  # Is a stop consonant
+            and j + 1 < len(lang[i])                    # Has an entry after it
+            and lang[i][j+1].label != ""                 # Not silence either
+            and lang[i][j+1].label[0] in vowels):       # Is a vowel or dipthong
+            # Build a script to find where pitch stops
+
+            # We're estimating VOT as "the time between where the
+            # consonant Interval of the TextGrid ends, and the
+            # beginning of pitch (voicing) starts".
+            start = lang[i][j].start
+            end = lang[i][j+1].end
+            zero = lang[i][j].end
+
+            script = [
+                'Read from file: "{}"'.format(join(folder, '{}{}.wav'.format(langname, i+1))),
+                'To Pitch: 0, 75, 600']
+
+            for k in numpy.arange(start, end, .001):
+                script.append('p = Get value at time: {:.3f}, "Hertz", "Linear"'.format(k))
+                script.append('appendInfoLine: p')
+
+            script.append('')
+
+            script = "\n".join(script)
+
+            f = open(join(folder, "tempscript.praat"), "w")
+            f.write(script)
+            f.close()
+
+            s = subprocess.run(["praat", join(folder, "tempscript.praat")], stdout=subprocess.PIPE)
+
+            s = str(s.stdout)[2:].split("\\n")
+            pitch_start = 0
+            for line in s:
+                if line == "--undefined--":
+                    pitch_start += 1
+                else:
+                    break
+
+            if not (pitch_start == 0 or pitch_start == len(s)):
+                # Find where the pitch starts, relative to the sound file
+                results = (start + pitch_start) - zero
+
+                # Check for palatalization
+                if "ʲ" in lang[i][j].label:
+                    consonant = lang[i][j].label[0] + "ʲ"
+                else:
+                    consonant = lang[i][j].label[0]
+                if consonant + lang[i][j+1].label[0] not in vot:
+                    vot[consonant + lang[i][j+1].label[0]] = []
+                vot[consonant + lang[i][j+1].label[0]].append(str(results))
+                if consonant + lang[i][j+1].label[0] not in vot:
+                    vot[consonant + lang[i][j+1].label[0]] = []
+                vot[consonant + lang[i][j+1].label[0]].append(str(results))
+
+for key in sorted(vot):
+    print("{},{},{}".format(key[0], key[1], ",".join(vot[key])))

+ 90 - 0
vot.py

@@ -0,0 +1,90 @@
+# Author: Josh Bicking
+
+from praatio import tgio
+import sys
+from os.path import join
+import numpy
+import subprocess
+
+if len(sys.argv) != 4:
+    print("Usage: {} FOLDER LANG TIER-NAME".format(sys.argv[0]))
+    print("FOLDER contains the 15 annotated voice samples.")
+    print("LANG is the language (the prefix of the sound/TextGrid files, english or russian).")
+    print("TIER-NAME is the name of the ipa tier. Either 'ipa' or 'IPA-phones'.")
+    exit(1)
+
+folder = sys.argv[1]
+langname = sys.argv[2]
+tiername = sys.argv[3]
+
+# list of vowels
+vowels = ["i","y","ɨ","ʉ","ɯ","u","ɪ","ʏ","ɪ̈","ʊ̈",
+          "ʊ","e","ø","ɘ","ɵ","ɤ","o","e̞","ø̞",
+          "ə","ɤ̞","o̞","ɛ","œ","ɜ","ɞ","ʌ","ɔ",
+          "æ","ɐ","a","ɶ","ä","ɑ","ɒ"]
+
+# List of all VOTs discovered
+vot = {}
+
+# Open all Files
+lang = []
+russian = []
+
+for i in range(1,16):
+    lang.append(tgio.openTextgrid(join(folder, "{}{}.TextGrid".format(langname, i))).tierDict[tiername].entryList)
+
+
+# Find all stop consonants with a vowel after them
+stop_consonants = ["t", "d", "k", "g", "p", "b", "ʔ"]
+
+for i in range(0,15):
+    for j in range(0, len(lang[i])):
+        if (lang[i][j].label != ""                      # Not silence
+            and lang[i][j].label[0] in stop_consonants  # Is a stop consonant
+            and j + 1 < len(lang[i])                    # Has an entry after it
+            and lang[i][j+1].label != ""                 # Not silence either
+            and lang[i][j+1].label[0] in vowels):       # Is a vowel or dipthong
+            # Build a script to find where pitch stops
+
+            # We're estimating VOT as "the time between where the
+            # consonant Interval of the TextGrid ends, and the
+            # beginning of pitch (voicing) starts".
+            start = lang[i][j].start
+            end = lang[i][j+1].end
+            zero = lang[i][j].end
+
+            script = [
+                'Read from file: "{}"'.format(join(folder, '{}{}.wav'.format(langname, i+1))),
+                'To Pitch: 0, 75, 600']
+
+            for k in numpy.arange(start, end, .001):
+                script.append('p = Get value at time: {:.3f}, "Hertz", "Linear"'.format(k))
+                script.append('appendInfoLine: p')
+
+            script.append('')
+
+            script = "\n".join(script)
+
+            f = open(join(folder, "tempscript.praat"), "w")
+            f.write(script)
+            f.close()
+
+            s = subprocess.run(["praat", join(folder, "tempscript.praat")], stdout=subprocess.PIPE)
+
+            s = str(s.stdout)[2:].split("\\n")
+            pitch_start = 0
+            for line in s:
+                if line == "--undefined--":
+                    pitch_start += 1
+                else:
+                    break
+
+            if not (pitch_start == 0 or pitch_start == len(s)):
+                # Find where the pitch starts, relative to the sound file
+                results = (start + pitch_start) - zero
+                if lang[i][j].label[0] + lang[i][j+1].label[0] not in vot:
+                    vot[lang[i][j].label[0] + lang[i][j+1].label[0]] = []
+                vot[lang[i][j].label[0] + lang[i][j+1].label[0]].append(str(results))
+
+for key in sorted(vot):
+    print("{},{},{}".format(key[0], key[1], ",".join(vot[key])))