5454ad0f2995da9fb0d5af9fbf48fed5bd19909c
[asr1617data.git] / segment.py
1 import sys
2 import pympi
3 import random
4
# Fraction of the shuffled frames that the script writes to the test set.
testset = 0.1
# Labelled frames accumulated by process(): one list per MFCC line.
data = list()
7
8
def process(num, *args):
    """Label every MFCC frame of one recording and append it to ``data``.

    Reads ``textgrid/NN.TextGrid`` and ``mfcc/NN.mfcc``, where ``NN`` is
    ``num`` zero-padded to two digits.  Each line of the MFCC file is treated
    as one frame, 0.025 time units after the previous one (presumably
    seconds, matching the TextGrid timestamps -- confirm against the MFCC
    extraction settings).  A frame is labelled ``1`` when the interval of the
    ``lyrics`` tier it falls in has empty text and ``0`` otherwise.  Frames
    past the end of the last interval keep the label of that last interval.

    Every frame is appended to the module-level ``data`` list as
    ``[label, field, field, ...]``; the last field always ends with a
    newline so a row can be written out as a complete line.

    :param num: Recording number; anything accepted by ``int()``.
    :param args: Ignored; allows a whole ``data.txt`` row to be passed.
    :raises ValueError: If the ``lyrics`` tier contains no intervals.
    """
    frame_step = 0.025
    number = int(num)

    textgrid = pympi.TextGrid('textgrid/{:02d}.TextGrid'.format(number))
    intervals = iter(textgrid.get_tier('lyrics').get_intervals(sort=True))

    first = next(intervals, None)
    if first is None:
        # Without any interval there is nothing to derive a label from.
        raise ValueError(
            'No intervals in the lyrics tier of recording {}'.format(num))
    _, end, text = first

    with open('mfcc/{:02d}.mfcc'.format(number)) as mfcc:
        for index, line in enumerate(mfcc):
            # Derive the time from the frame index instead of summing the
            # step, so rounding errors do not pile up over long files.
            current = index * frame_step

            # Skip every interval that ended before this frame; an interval
            # may be shorter than a single frame step.
            while current > end:
                following = next(intervals, None)
                if following is None:
                    # Out of intervals: keep using the last one.
                    break
                _, end, text = following

            label = 1 if text == '' else 0

            # Keep exactly one trailing newline on the last field, even when
            # the final line of the file has none.
            fields = line.rstrip('\n').split('\t')
            fields[-1] += '\n'
            data.append([label] + fields)
31
32
def _write_rows(path, rows):
    """Write every row to ``path`` as one tab-separated line."""
    with open(path, 'w') as out:
        # A row's last field normally still carries the newline read from the
        # feature file; normalise so every row ends with exactly one.
        out.writelines(
            '\t'.join(map(str, row)).rstrip('\n') + '\n' for row in rows)


# Script entry point: label the frames of every recording listed in
# data.txt, shuffle them and split them into test.txt and train.txt.
if __name__ == '__main__':
    with open('data.txt', 'r') as f:
        for l in f:
            s = l.strip().split('\t')
            if s == ['']:
                # Blank line: nothing to process.
                continue
            # The second column is only used for the progress message.
            description = s[1] if len(s) > 1 else ''
            sys.stderr.write('Processing {}: {}\n'.format(s[0], description))
            process(*s)

    sys.stderr.write('Shuffling {} samples\n'.format(len(data)))
    random.shuffle(data)

    # The first `testset` fraction of the shuffled frames becomes the test
    # set, the remainder the training set.  The fraction constant itself is
    # left untouched rather than being rebound to the list of rows.
    splitpoint = int(len(data) * testset)
    test_rows = data[:splitpoint]
    train_rows = data[splitpoint:]
    del data

    sys.stderr.write('Write testset: {} items\n'.format(len(test_rows)))
    _write_rows('test.txt', test_rows)

    # Report the real size of the training set, not an estimate derived from
    # the size of the test set.
    sys.stderr.write(
        'Write trainingset: {:d} items\n'.format(len(train_rows)))
    _write_rows('train.txt', train_rows)