+ added some testcases
[qpalma.git] / scripts / check_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 #
5 # The purpose of this script is to check the created dataset pickle files for
6 # consistency before doing any kind of training/prediction on the data.
7 #
8
9 import sys
10 import pdb
11 import cPickle
12
13 from qpalma.sequence_utils import get_seq_and_scores,unbracket_seq,reverse_complement
14
15
def checkAll(filename):
    """
    Load a pickled dataset and run sanity checks on every example in it.

    The dataset is iterated as a mapping from an example key to a collection
    of matches.  Each match is unpacked as a 3-tuple
    ``(seq_info, original_est, qualities)`` and ``seq_info`` is unpacked as
    ``(id, chromo, strand, genomicSeq_start, genomicSeq_stop)``.

    For every match the function asserts that

    * ``chromo`` is one of 1..5,
    * ``strand`` is either '+' or '-',
    * the id stored in the sequence info equals the dataset key,
    * ``reverse_complement()`` of the sequence fetched for strand '-' equals
      the sequence fetched for strand '+' over the same region.

    Arguments:
        filename -- path of the dataset pickle file to check.

    Returns:
        The tuple ``(status, mes)``.  All checks are assertions, so whenever
        the function returns at all the result is ``(True, '---')``.

    Raises:
        AssertionError -- if any of the checks fails.  For the chromosome and
        strand checks the debugger is entered first (see below).
    """

    # NOTE(review): unpickling executes arbitrary code embedded in the file;
    # only run this on dataset files from a trusted source.
    # Binary mode is required for pickles; the context manager makes sure the
    # file handle is closed again.
    with open(filename, 'rb') as dataset_file:
        dataset = cPickle.load(dataset_file)

    # we take the first quality vector of the tuple of quality vectors
    quality_index = 0

    status = True
    mes = '---'

    # Loop invariants: location of the genome flat files and the set of
    # accepted chromosome numbers.
    dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
    valid_chromos = range(1, 6)
    valid_strands = ['+', '-']

    # Number of examples processed so far (over all keys).
    idx = 0
    for example_key, matches in dataset.items():
        print('Current example %d has %d matches' % (example_key, len(matches)))

        for example in matches:
            (currentSeqInfo, original_est, currentQualities) = example

            (example_id, chromo, strand, genomicSeq_start, genomicSeq_stop) = currentSeqInfo

            # The assertion "message" is only evaluated when the check fails,
            # so a failing check drops into the debugger before raising.
            assert chromo in valid_chromos, pdb.set_trace()
            assert strand in valid_strands, pdb.set_trace()

            # The value itself is unused; the lookup is kept so that a missing
            # quality vector still surfaces as an error here.
            quality = currentQualities[quality_index]

            # check for key consistency
            assert example_id == example_key

            genomic_seq_pos, acc_pos, don_pos = get_seq_and_scores(
                chromo, '+', genomicSeq_start, genomicSeq_stop, dna_flat_files, False)
            genomic_seq_neg, acc_neg, don_neg = get_seq_and_scores(
                chromo, '-', genomicSeq_start, genomicSeq_stop, dna_flat_files, False)

            assert reverse_complement(genomic_seq_neg) == genomic_seq_pos

            # Progress report every 1000 examples.
            idx += 1
            if idx % 1000 == 0:
                print('processed %d examples' % idx)

    return status, mes
61
62
if __name__ == '__main__':
    # The script takes the dataset pickle file as its first argument; give a
    # usage hint instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: %s <dataset pickle file>\n' % sys.argv[0])
        sys.exit(1)

    dataset_fn = sys.argv[1]
    status, mes = checkAll(dataset_fn)

    # Report the outcome; the message is only shown for a failed check.
    if status:
        print('Dataset %s seems to be consistent.' % dataset_fn)
    else:
        print('Dataset %s seems to be inconsistent!' % dataset_fn)
        print(mes)