+++ /dev/null
-import sys
-import pympi
-import random
-import glob
-import re
-import os
-
-testset = 0.10
-data = []
-
-
def process(num, frame_step=0.01):
    """Label every MFCC frame of one recording and append it to ``data``.

    The recording number is taken from the file name of ``num``. The matching
    ``textgrid/NN.TextGrid`` and ``mfcc/NN.mfcc`` files (relative to the
    current working directory) are then read. Each line of the MFCC file is
    treated as one frame; frame ``i`` is placed at time ``i * frame_step`` and
    matched against the sorted intervals of the ``lyrics`` tier.

    A frame gets label ``1`` when the text of its interval is empty and ``0``
    otherwise. The row ``[label, field, field, ...]`` (fields are the
    tab-separated columns of the line, the last one keeping its newline) is
    appended to the module-level ``data`` list.

    :param num: Path (or bare file name) of the form ``<digits>.TextGrid``.
    :param frame_step: Time covered by one MFCC line, in the same unit as the
        TextGrid interval boundaries (presumably seconds -- confirm against
        the feature extraction settings).
    :raises ValueError: If ``num`` is not a numbered TextGrid file name or the
        ``lyrics`` tier contains no intervals.
    """
    # Match on the basename so the result does not depend on the path
    # separator used by the platform.
    match = re.match(r'^(\d+)\.TextGrid$', os.path.basename(num))
    if match is None:
        raise ValueError('Not a numbered TextGrid file: {!r}'.format(num))
    index = int(match.group(1))

    tgob = pympi.TextGrid('textgrid/{:02d}.TextGrid'.format(index))
    intervalit = iter(tgob.get_tier('lyrics').get_intervals(sort=True))

    current = next(intervalit, None)
    if current is None:
        raise ValueError(
            'Tier "lyrics" of recording {:02d} has no intervals'.format(index))
    _start, e, v = current

    with open('mfcc/{:02d}.mfcc'.format(index)) as mfcc:
        for frameno, line in enumerate(mfcc):
            # Derive the time from the frame index instead of summing the
            # step repeatedly, which would accumulate rounding error.
            currentframe = frameno * frame_step

            # Skip every interval that ends before this frame; several may
            # need to be skipped when intervals are shorter than one step.
            while currentframe > e:
                current = next(intervalit, None)
                if current is None:
                    # Past the last interval: keep using its label.
                    break
                _start, e, v = current

            label = 1 if v == '' else 0

            # Guarantee the row ends in a newline so that rows stay on
            # separate lines when written out verbatim.
            if not line.endswith('\n'):
                line += '\n'
            data.append([label] + line.split('\t'))
-
-
def _write_rows(path, rows):
    """Write each row to ``path`` as one tab-separated line."""
    with open(path, 'w') as f:
        for row in rows:
            line = '\t'.join(map(str, row))
            # Rows normally carry the newline of their source line in the
            # last field; add one only when it is missing.
            if not line.endswith('\n'):
                line += '\n'
            f.write(line)


# Script entry point: label all recordings found in ./textgrid, shuffle the
# collected frames and split them into test.txt and train.txt. The share of
# frames going to the test file is given by the module-level ``testset``.
if __name__ == '__main__':
    for fl in glob.glob(os.path.join(os.getcwd(), 'textgrid', '*.TextGrid')):
        sys.stderr.write('Segment {}\n'.format(fl))
        process(fl)

    sys.stderr.write('Shuffling {} samples\n'.format(len(data)))
    random.shuffle(data)

    # The first ``testset`` fraction of the shuffled rows becomes the test
    # set, the remainder the training set.
    splitpoint = int(len(data) * testset)
    test_rows = data[:splitpoint]
    train_rows = data[splitpoint:]
    del data

    sys.stderr.write('Write testset: {} items\n'.format(len(test_rows)))
    _write_rows('test.txt', test_rows)

    # Report the real size of the training set rather than an estimate
    # derived from the split point.
    sys.stderr.write(
        'Write trainingset: {:d} items\n'.format(len(train_rows)))
    _write_rows('train.txt', train_rows)