+ found negative strand bug
[qpalma.git] / tools / run_specific_scripts / transcriptome_analysis / compile_dataset.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import array
5 import cPickle
6 import os
7 import pdb
8 import random
9 import sys
10
11 import qpalma
12 import qpalma.tools
13
14 from qpalma.parsers import *
15 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
16
17 import QPalmaConfiguration as Conf
18
19 #
20 # This script takes as input the map.vm map_2nd.vm files and generates QPalma
21 # dataset pickle files
22 #
23
24
25 class DatasetGenerator:
26 """
27 This class wraps the dataset generation for the matches of the second vmatch
28 run and the heuristic-filtered matches of the first run.
29
30 An example in our dataset consists of:
31
32 - information on the original read:
33 * id
34 * nucleotide sequence
35 * quality vectors (at the moment: prb and chastity)
36
37 - information on the target dna sequence:
38 * chromosome
39 * strand
40 * fragment start
41 * fragment end positions
42
43 this information is stored in tuples which are stored then in a dictionary
44 indexed by the reads unique id.
45
46 CAUTION: The dictionary stores a list of the above defined tuples under each
47 'id'. This is because one read can be found several times on the genome.
48 """
49
50 def __init__(self,map_file,map_2nd_file):
51 assert os.path.exists(map_file), 'Error: Can not find map file'
52 print map_2nd_file
53 assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
54 self.map_file = map_file
55 self.map_2nd_file = map_2nd_file
56
57 self.dataset = []
58
59 self.read_size = Conf.read_size
60
61 self.half_window_size = 1500
62
63
64 def saveAs(self,dataset_file):
65 dataset_fn = '%s.pickle'% dataset_file
66 dataset_keys_fn = '%s.keys.pickle'%dataset_file
67
68 print dataset_fn, dataset_keys_fn
69
70 assert not os.path.exists(dataset_fn), 'The data_file already exists!'
71 assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
72
73 # saving new dataset and single keys as well
74 cPickle.dump(self.dataset,open(dataset_fn,'w+'),protocol=2)
75 cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
76
77
78 def parse_map_file(self,dataset,map_file,first_round):
79 instance_counter = 0
80
81 g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
82 acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
83 don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
84
85 g_fmt = 'chr%d.dna.flat'
86 s_fmt = 'contig_%d%s'
87
88 num_chromo = 6
89
90 accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
91 seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
92
93 not_consistent_ctr = 0
94
95 for line in open(map_file):
96 if instance_counter > 0 and instance_counter % 5000 == 0:
97 print 'processed %d examples' % instance_counter
98
99 if instance_counter == 10000:
100 print 'Not consistent ctr: %d' % not_consistent_ctr
101 # break
102
103 slist = line.split()
104
105 id = int(slist[0])
106 chromo = int(slist[1])
107 pos = int(slist[2])
108
109 # for the time being we only consider chromosomes 1-5
110 if not chromo in range(1,6):
111 continue
112
113 # convert D/P strand info to +/-
114 if slist[3] == 'D':
115 strand = '+'
116 else:
117 strand = '-'
118
119 # QPalma uses lowercase characters
120 bracket_seq = slist[4].lower()
121 read_seq = unbracket_seq(bracket_seq)
122 reconstructed_dna_seq = reconstruct_dna_seq(bracket_seq)
123
124 if strand == '-' and first_round:
125 read_seq = reverse_complement(read_seq)
126 #reconstructed_dna_seq = reverse_complement(reconstructed_dna_seq)
127
128 assert not '[' in read_seq and not ']' in read_seq
129
130 # we use an area of +/- `self.half_window_size` nucleotides around the seed position
131 if pos > self.half_window_size+1:
132 us_offset = self.half_window_size
133 else:
134 us_offset = pos - 1
135
136 if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
137 ds_offset = self.half_window_size
138 else:
139 ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
140
141 genomicSeq_start = pos - us_offset
142 genomicSeq_stop = pos + ds_offset
143
144 dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
145
146 #if strand == '-':
147 # dna_part = dna_seq[us_offset-len(read_seq):us_offset]
148 #else:
149 # dna_part = dna_seq[us_offset:us_offset+len(read_seq)]
150
151 #if reconstructed_dna_seq != dna_part:
152 # not_consistent_ctr += 1
153 # pdb.set_trace()
154
155 ## check whether everything is ok
156 # try:
157 # currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
158 # except:
159 # problem_ctr += 1
160 # continue
161
162 # In order to save some space we use a signed char to store the
163 # qualities. Each quality element can range as follows: -128 <= elem <= 127
164 prb = array.array('b',map(lambda x: ord(x)-64,slist[5]))
165
166 # add instance to set
167 currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
168 currentQualities = [prb]
169
170 # As one single read can have several vmatch matches we store all
171 # these matches under the unique id of the read
172 dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
173
174 instance_counter += 1
175
176 print 'Total examples processed: %d' % instance_counter
177 print 'Not consistent ctr: %d' % not_consistent_ctr
178 return dataset
179
180
181 def compile_dataset(self):
182 dataset = {}
183
184 # usually we have two files to parse:
185 # the map file from the second run and a subset of the map file from the
186 # first run
187
188 dataset = self.parse_map_file(dataset,self.map_file,True)
189 dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
190
191 self.dataset = dataset