+ training works,
[qpalma.git] / scripts / check_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import pdb
6 import cPickle
7
8 from compile_dataset import compile_d,get_seq_and_scores,get_seq
9 from qpalma_main import unbracket_est
10
11 def checkAll(filename):
12 dataset = cPickle.load(open(filename))
13 [SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences] = dataset
14
15 for idx in range(len(SeqInfo)):
16 if idx > 0 and idx % 250 == 0:
17 print 'processed %d examples' % idx
18
19 currentInfo = SeqInfo[idx]
20 chr,strand,p1,p2 = currentInfo
21 currentExon = Exons[idx]
22 currentExon[0,1] += 1
23 currentExon -= 1
24
25 currentEst = OriginalEsts[idx]
26 originalEst = OriginalEsts[idx]
27
28 if currentEst.find('[') != -1:
29 #print currentEst
30 currentEst = unbracket_est(currentEst)
31 #print currentEst
32
33 assert len(currentEst) == 36, pdb.set_trace()
34
35 first_seq = get_seq( currentExon[0,0], currentExon[0,1], True )
36 end = first_seq[-2:]
37 first_seq = first_seq[:-2]
38 second_seq = get_seq( currentExon[1,0], currentExon[1,1]+1, False )
39 begin = second_seq[:2]
40 second_seq = second_seq[2:]
41
42 dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
43 seq, acc, don =\
44 get_seq_and_scores(chr,strand,currentExon[0,0],currentExon[1,1]+1,dna_flat_files)
45
46 assert (len(first_seq) + len(second_seq)) == 36, pdb.set_trace()
47
48 if not (end == 'gt' or end == 'gc'):
49 print 'invalid donor in example %d'% idx
50 print SeqInfo[idx]
51 print currentExon
52
53 #invalid_donor_ctr += 1
54 #continue
55
56 if not (begin == 'ag'):
57 print 'invalid acceptor in example %d'% idx
58 print SeqInfo[idx]
59 print currentExon
60
61 pdb.set_trace()
62
63
64 if __name__ == '__main__':
65 checkAll(sys.argv[1])