Merge branch 'master' of git.ygdrassil:asr1617data
[asr1617data.git] / segment.py
1 import sys
2 import pympi
3 import random
4 import glob
5 import re
6 import os
7
# Fraction of all collected samples that is held out for the test set; the
# script section multiplies it by len(data) to find the split point.
testset = 0.10
# Labelled samples collected by process(): one list per MFCC line, holding
# the label followed by that line's tab-separated fields.
data = []
10
11
def process(num, frame_step=0.01):
    """Label every MFCC line of one recording and append it to ``data``.

    The recording number is taken from the file name of ``num``.  The
    matching ``textgrid/NN.TextGrid`` and ``mfcc/NN.mfcc`` files are then
    opened relative to the current working directory.  Every line of the
    MFCC file is treated as one frame, ``frame_step`` later in time than
    the previous one, and is paired with the interval of the ``lyrics``
    tier that is current at that time.

    For each frame a row ``[label] + fields`` is appended to the
    module-level ``data`` list.  ``label`` is 1 when the interval text is
    empty and 0 otherwise; ``fields`` are the tab-separated columns of the
    line, the last of which keeps its trailing newline.

    :param num: path to a TextGrid file named ``<digits>.TextGrid``.
    :param frame_step: time between two consecutive MFCC lines, in the unit
        used by the TextGrid interval boundaries (presumably seconds).
    :raises ValueError: if the file name is not ``<digits>.TextGrid`` or if
        the ``lyrics`` tier contains no intervals.
    """
    # Match on the base name so the path separator of the platform does not
    # matter, and escape the dot so only a literal ".TextGrid" is accepted.
    match = re.match(r'^(\d+)\.TextGrid$', os.path.basename(num))
    if match is None:
        raise ValueError('Not a numbered TextGrid file: {!r}'.format(num))
    number = int(match.group(1))

    grid = pympi.TextGrid('textgrid/{:02d}.TextGrid'.format(number))
    intervals = grid.get_tier('lyrics').get_intervals(sort=True)

    first = next(intervals, None)
    if first is None:
        raise ValueError(
            'No intervals in lyrics tier of recording {:02d}'.format(number))
    _, end, text = first

    with open('mfcc/{:02d}.mfcc'.format(number)) as mfcc:
        for index, line in enumerate(mfcc):
            # Derive the time from the frame index instead of summing the
            # step repeatedly, so rounding errors do not accumulate.
            frame_time = index * frame_step

            # Skip over every interval that ended before this frame; a
            # single step is not enough when an interval is shorter than
            # one frame.
            while frame_time > end:
                following = next(intervals, None)
                if following is None:
                    # Out of intervals: keep the last label for all the
                    # remaining frames and stop looking for more.
                    end = float('inf')
                    break
                _, end, text = following

            # The writer relies on the newline carried by the last field,
            # so make sure a final line without one still ends the record.
            if not line.endswith('\n'):
                line += '\n'

            label = 1 if text == '' else 0
            data.append([label] + line.split('\t'))
35
36
def _write_rows(path, rows):
    """Write every row to ``path`` as one tab-separated line."""
    with open(path, 'w') as f:
        for row in rows:
            line = '\t'.join(map(str, row))
            # Rows normally end with the newline of the line they were read
            # from; add one when it is missing so records never run together.
            if not line.endswith('\n'):
                line += '\n'
            f.write(line)


# Script entry point: label all recordings found in ./textgrid, shuffle the
# resulting samples and split them into test.txt and train.txt.
if __name__ == '__main__':
    pattern = os.path.join(os.getcwd(), 'textgrid', '*.TextGrid')
    # glob returns files in arbitrary order; sort for a stable progress log.
    for fl in sorted(glob.glob(pattern)):
        sys.stderr.write('Segment {}\n'.format(fl))
        process(fl)

    sys.stderr.write('Shuffling {} samples\n'.format(len(data)))
    random.shuffle(data)

    # The first `testset` fraction of the shuffled samples is the test set.
    # Separate names are used so the fraction itself is not overwritten.
    splitpoint = int(len(data) * testset)
    test_rows = data[:splitpoint]
    train_rows = data[splitpoint:]
    # Drop the combined list; the two slices hold the rows from here on.
    del data

    sys.stderr.write('Write testset: {} items\n'.format(len(test_rows)))
    _write_rows('test.txt', test_rows)

    # Report the real size of the training set rather than an estimate.
    sys.stderr.write(
        'Write trainingset: {:d} items\n'.format(len(train_rows)))
    _write_rows('train.txt', train_rows)