+ fixed nasty index bugs when creating the label of the ground truth
[qpalma.git] / scripts / check_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import pdb
6 import cPickle
7
8 from compile_dataset import compile_d,get_seq_and_scores,get_seq
9 from qpalma_main import unbracket_est
10
def checkAll(filename):
    """Sanity-check every example stored in a pickled dataset file.

    The pickle must hold five parallel sequences:
    ``[SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences]``.
    For each example the function

    * strips bracket annotations from the read with ``unbracket_est`` when
      the read contains a ``'['``,
    * asserts that the read is 36 characters long,
    * fetches the two exon sequences with ``get_seq`` and asserts that,
      after removing two characters at the junction side of each, their
      combined length is 36 as well,
    * reports the example when the two characters cut from the end of the
      first sequence are neither ``'gt'`` nor ``'gc'`` ("invalid donor"),
    * reports the example when the two characters cut from the start of the
      second sequence are not ``'ag'`` ("invalid acceptor") and then drops
      into the debugger.

    A failing assertion opens ``pdb`` before ``AssertionError`` is raised.

    :param filename: path of the pickled dataset to check.
    :raises AssertionError: if a read or its exon sequences have an
        unexpected length (only when assertions are enabled).
    """
    # NOTE(security): unpickling can execute arbitrary code -- only run this
    # on dataset files from a trusted source.
    # try/finally instead of ``with`` keeps this working on old Python 2
    # interpreters while still guaranteeing the handle is closed.
    handle = open(filename)
    try:
        dataset = cPickle.load(handle)
    finally:
        handle.close()

    # Qualities and AlternativeSequences are not inspected, but unpacking
    # all five entries verifies the layout of the pickle.
    [SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences] = dataset

    # Length every read (and the two exon pieces together) must have.
    expected_len = 36

    # Loop-invariant location of the genome files.
    dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'

    for idx, currentInfo in enumerate(SeqInfo):
        if idx > 0 and idx % 250 == 0:
            print('processed %d examples' % idx)

        # p1 and p2 are unused here; the unpacking checks the record shape.
        chrom, strand, p1, p2 = currentInfo
        currentExon = Exons[idx]
        currentEst = OriginalEsts[idx]

        if '[' in currentEst:
            currentEst = unbracket_est(currentEst)

        # The message expression is evaluated only when the assertion fails,
        # so a failure opens the debugger right at the offending example.
        assert len(currentEst) == expected_len, pdb.set_trace()

        # The fetched first sequence carries two extra trailing characters
        # (compared against the donor consensus below); split them off.
        first_seq = get_seq(currentExon[0, 0], currentExon[0, 1], True)
        end = first_seq[-2:]
        first_seq = first_seq[:-2]

        # Likewise the second sequence carries two extra leading characters
        # (compared against the acceptor consensus below).
        second_seq = get_seq(currentExon[1, 0], currentExon[1, 1] + 1, False)
        begin = second_seq[:2]
        second_seq = second_seq[2:]

        # The returned values are not inspected; the call is kept so that
        # the lookup is still exercised for every example.
        seq, acc, don = get_seq_and_scores(
            chrom, strand, currentExon[0, 0], currentExon[1, 1] + 1,
            dna_flat_files)

        assert (len(first_seq) + len(second_seq)) == expected_len, pdb.set_trace()

        if end not in ('gt', 'gc'):
            print('invalid donor in example %d' % idx)
            print(currentInfo)
            print(currentExon)

        if begin != 'ag':
            print('invalid acceptor in example %d' % idx)
            print(currentInfo)
            print(currentExon)

            # NOTE(review): the original indentation of this breakpoint was
            # lost; it is assumed to belong to the invalid-acceptor branch
            # -- confirm against the repository.
            pdb.set_trace()
59
60
if __name__ == '__main__':
    # The script takes the path of the pickled dataset as its first
    # argument; exit with a usage message instead of a bare IndexError
    # when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <pickled_dataset>' % sys.argv[0])
    checkAll(sys.argv[1])