+ fixed a bug in the C++ interface
[qpalma.git] / standalone / palma / output_formating.py
1 #
2 # This program is free software; you can redistribute it and/or modify
3 # it under the terms of the GNU General Public License as published by
4 # the Free Software Foundation; either version 2 of the License, or
5 # (at your option) any later version.
6 #
7 # Written (W) 2006-2007 Uta Schulze, Gunnar Raetsch
8 # Copyright (C) 2006-2007 Max-Planck-Society
9 #
10
11 import pdb
12
13 def print_line(score, match, mism, qGapb, tGapb, strand, tStrand, qName, qSize, qStart, qEnd,\
14 tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
15 tExonSizes, tStarts, tEnds, exon_length, identity, out_file):
16 """Produces a tab separated line in the out_file which describes the alignment.
17
18 1. align. score: palma alignment score
19 2. match: number of matches in the local alignment
20 3. mismatch: number of mismatches in the local alignment
21 4. Q gap bases: number of gaps on the target sequence (insert in query acc. to BLAT)
22 5. T gap bases: number of gaps on the query sequence (insert in target acc. to BLAT)
23 (introns, which are basically gaps on the query, are not included!)
24 6. strand: '+' or '-' for query strand (second strand for target if provided)
25 if you align two "short" sequences and no whole genome search, the
26 strand equals "+" by default and logic.
27 7. Q name: first word in header of the entry in the query-file.
28 8. Q size: length of the sequence of the entry in the query-file
29 9. Q start: alignment start position on query sequence
30 10. Q end: alignment end position on query sequence
31 11. T name: first word in header of the entry in the target-file.
32 12. T size: length of the sequence of the entry in the target-file
33 13. T start: alignment start position on target sequence
34 14. T end: alignment end position on target sequence
35 15. exon count: number of exons in the alignment. Different to BLAT, where each block is
36 defined by the surrounding gaps, exons are defined by adjacent introns.
37 Exons can contain gaps. The length of the without gaps on both target
38 and query will most likely be different.
39 16. Q bound: Exact beginning and end of each exon on the query, without showing the gaps.
40 E.g. "ex1_start-ex1_end,ex2_start-ex2_end,..,"
41 Beginning of first exon most likely equals Q start, except the alignment
42 begins with an intron. Analog with Q end for the end of the last exon.
43 17. T bound: Exact beginning and end of each exon on the query, without showing the gaps.
44 Beginning of first exon most likely equals T start, except the alignment
45 begins with an intron. Analog with T end for the end of the last exon.
46 18. exon lengths: These are actually the length of the exons in the alignment, where gaps are
47 included.
48 19. identity: Number of matches divided by length of exon on alignment times 100.
49 If the exon is perfect, and there are no gaps or mismatches, it's 100%.
50 """
51
52 qExonSizes_string = ''
53 for i in qExonSizes:
54 qExonSizes_string = qExonSizes_string + ('%d,' %i)
55 tExonSizes_string = ''
56 for i in tExonSizes:
57 tExonSizes_string = tExonSizes_string + ('%d,' %i)
58 qBounds_string = ''
59 for (i,j) in zip(qStarts, qEnds):
60 qBounds_string = qBounds_string + ('%d-%d,' %(i,j))
61 tBounds_string = ''
62 for (i,j) in zip(tStarts, tEnds):
63 tBounds_string = tBounds_string + ('%d-%d,' %(i,j))
64 exon_length_string = ''
65 for i in exon_length:
66 exon_length_string = exon_length_string + ('%d,' %i)
67 identity_string = ''
68 for i in identity:
69 identity_string = identity_string + ('%1.1f,' %(i*100))
70
71 if tStrand == '+':
72 out_file.write("%4.3f\t%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n"\
73 %(score, match, mism, qGapb, tGapb, strand, qName, qSize, qStart,\
74 qEnd, tName, tSize, tStart, tEnd, num_exons, qBounds_string,\
75 tBounds_string, exon_length_string, identity_string))
76 if tStrand == '-':
77 out_file.write("%4.3f\t%d\t%d\t%d\t%d\t%s%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\n"\
78 %(score, match, mism, qGapb, tGapb, strand, tStrand, qName, qSize, qStart,\
79 qEnd, tName, tSize, tStart, tEnd, num_exons, qBounds_string,\
80 tBounds_string, exon_length_string, identity_string))
81 return
82 #<------------------------------------------------------------------------------>
83
84 #<------------------------------------------------------------------------------>
85 # prints a line in the output_file which describes the alignment, in blat psl format
86 #<------------------------------------------------------------------------------>
87 def print_line_blatformat(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
88 tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
89 tExonSizes, tStarts, tEnds, exon_length, identity, ssQuality, out_file):
90 """
91 matches int unsigned , # Number of bases that match (in BLAT this excludes repeats, not here)
92 misMatches int unsigned , # Number of bases that don't match
93 repMatches int unsigned , # Number of bases that match but are part of repeats (always zero here)
94 nCount int unsigned , # Number of 'N' bases (always zero here)
95 qNumInsert int unsigned , # Number of inserts in query (length of tStarts)
96 qBaseInsert int unsigned , # Number of bases inserted in query
97 tNumInsert int unsigned , # Number of inserts in target (length of qStarts)
98 tBaseInsert int unsigned , # Number of bases inserted in target
99 strand char(2) , # + or - for query strand, optionally followed by + or - for target strand
100 qName varchar(255) , # Query sequence name
101 qSize int unsigned , # Query sequence size
102 qStart int unsigned , # Alignment start position in query
103 qEnd int unsigned , # Alignment end position in query
104 tName varchar(255) , # Target sequence name
105 tSize int unsigned , # Target sequence size
106 tStart int unsigned , # Alignment start position in target
107 tEnd int unsigned , # Alignment end position in target
108 blockCount int unsigned , # Number of blocks in alignment. A block contains no gaps.
109 blockSizes longblob , # Size of each block in a comma separated list
110 qStarts longblob , # Start of each block in query in a comma separated list
111 tStarts longblob , # Start of each block in target in a comma separated list
112 """
113
114 if tStrand == '-':
115 exon_length.reverse()
116 qEnds.reverse()
117 qStarts = map(lambda x: qSize - x + 1, qEnds)
118 tStarts.reverse()
119 tEnds.reverse()
120 #if strand == '-':
121 # qStarts.reverse()
122 # qEnds.reverse()
123 if True:
124 for i in range(len(qStarts)):
125 if i>0: qStarts[i]=qStarts[i-1]+exon_length[i-1]
126
127 repMatches = 0
128 nCount = 0
129 qStarts_string = ''
130 for i in qStarts:
131 assert(i>0)
132 qStarts_string = qStarts_string + ('%d,' %(i-1))
133 tStarts_string = ''
134 #if len(tStarts)>0:
135 # tStarts_string = tStarts_string + ('%d,' %(tStarts[0]))
136 for i in tStarts:
137 assert(i>0)
138 tStarts_string = tStarts_string + ('%d,' %(i-1))
139 exon_length_string = ''
140 for i in exon_length:
141 exon_length_string = exon_length_string + ('%d,' %i)
142 ssQuality_string = ''
143 for i in ssQuality:
144 ssQuality_string = ssQuality_string + ('%1.1f,' %(i*100))
145 identity_string = ''
146 for i in identity:
147 identity_string = identity_string + ('%1.1f,' %(i*100))
148
149 out_file.write('%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t'\
150 %(match, mism, repMatches, nCount, len(tStarts), qGapb, len(qStarts), tGapb))
151 #out_file.write('%c%c\t' %(tStrand,qStrand))
152 out_file.write('%c\t' %(tStrand))
153
154 out_file.write('%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\n'\
155 %(qName, qSize, qStart-1, qEnd, tName, tSize, tStart-1, tEnd, \
156 num_exons, exon_length_string, qStarts_string, tStarts_string, identity_string, ssQuality_string))
157
158 return
159 #<------------------------------------------------------------------------------>
160
161
162 #<------------------------------------------------------------------------------>
163 # prints the alignment, should be clear (hehe)
164 # len(DNA) == len(EST)
165 #<------------------------------------------------------------------------------>
166 def print_align(DNA, EST, comparison, qIDX, tIDX, out_file, width=60):
167 import numpy
168 idx = []
169 idx_old = []
170 for i in range(1,len(DNA),width): #e.g. 1, 61, 121 ...
171 if i == 1:
172 idx = [i,numpy.minimum(i+width-1,len(DNA))]
173 out_file.write("Q: %9i %s %9i\n" %(qIDX[idx[0]-1], EST[idx[0]-1:idx[1]], qIDX[idx[1]-1]))
174 out_file.write(" %s \n" %(comparison[idx[0]-1:idx[1]]))
175 out_file.write("T: %9i %s %9i\n\n\n" %(tIDX[idx[0]-1], DNA[idx[0]-1:idx[1]], tIDX[idx[1]-1]))
176 else:
177 idx_old = idx
178 idx = [i,numpy.minimum(i+width-1,len(DNA))]
179 out_file.write("Q: %9i %s %9i\n" %(min(qIDX[idx_old[1]-1]+1,qIDX[idx[1]-1]), EST[idx[0]-1:idx[1]], qIDX[idx[1]-1]))
180 out_file.write(" %s \n" %(comparison[idx[0]-1:idx[1]]))
181 out_file.write("T: %9i %s %9i\n\n\n" %(min(tIDX[idx_old[1]-1]+1,tIDX[idx[1]-1]), DNA[idx[0]-1:idx[1]], tIDX[idx[1]-1]))
182 return
183
184
185 def print_header(out_file):
186 """prints some lines in output_file that describe what each column means
187
188 Can be turned off via --noHead.
189 """
190 out_file.write("align.\tmatch\tmis-\tQ gap\tT gap\tstrand\tQ\tQ\tQ\tQ\tT\tT\tT\tT\texon\tQ\tT\texon\tidentity\n")
191 out_file.write("score\t\tmatch\tbases\tbases\t\tname\tsize\tstart\tend\tname\tsize\tstart\tend\tcount\tbound\tbound\tlengths\n")
192 out_file.write("--------------------------------------------------------------------------------------------------------------------------------------------------------\n")
193 return
194
195 def print_blat_header(out_file):
196 """prints blat header lines in output_file that describe what each column means
197
198 Copied from blat output. Can be turned off via --noHead.
199 """
200 out_file.write('psLayout version 3\n\n')
201 out_file.write("match\tmis-\trep.\tN's\tQ gap\tQ gap\tT gap\tT gap\tstrand\tQ\t\tQ\tQ\tQ\tT\t\tT\tT\tT\tblock\tblockSizes\tqStarts\t tStarts\n")
202 out_file.write('\tmatch\tmatch\t\tcount\tbases\tcount\tbases\t\tname\t\tsize\tstart\tend\tname\t\tsize\tstart\tend\tcount\n')
203 out_file.write('---------------------------------------------------------------------------------------------------------------------------------------------------------------\n')
204
205 def print_bed_header(bed_file):
206 """prints some lines in bed_file that describe what each column means
207
208 Can be turned off via --noHead.
209 """
210 bed_file.write("chrom\tchrom\tchrom\tname\tscore\tstrand\tblock\tblock\tblock\n")
211 bed_file.write("\tStart\tEnd\t\t\t\tCount\tSizes\tStart\n")
212 bed_file.write("---------------------------------------------------------------------\n")
213 return
214
215
216
217 #<------------------------------------------------------------------------------>
218 # prints a line in the output bed_file which describes the alignment
219 #<------------------------------------------------------------------------------>
220 def print_bed_line(tName, tStart, tEnd, qName, score, tStrand,\
221 num_exons, tStarts, tEnds, bed_file):
222
223 tStarts_string = ''
224 tExonSizes_string = ''
225 for (i,j) in zip(tStarts, tEnds):
226 tStarts_string = tStarts_string + ('%d,' %(i))
227 tExonSizes_string = tExonSizes_string + ('%d,' %(j-i+1))
228
229 bed_file.write("%s\t%d\t%d\t%s\t%4.3f\t%s\t%d\t%s\t%s\n"\
230 %(tName, tStart, tEnd, qName, score, tStrand,\
231 num_exons, tExonSizes_string, tStarts_string))
232 return
233 #<------------------------------------------------------------------------------>
234
235
236 def print_results(align_info, out_file, bed_file, tName, tSize, qName, qSize, qStrand, tStrand, parameters):
237 """printing results in output_file"""
238
239 score = align_info.score
240 match = align_info.match
241 mism = align_info.mism
242 qGapb = align_info.qGapb
243 tGapb = align_info.tGapb
244
245 tStart = align_info.tStart
246 tEnd = align_info.tEnd
247 tStarts = align_info.tStarts
248 tEnds = align_info.tEnds
249 tIDX = align_info.tIDX
250
251 qStart = align_info.qStart
252 qEnd = align_info.qEnd
253 qStarts = align_info.qStarts
254 qEnds = align_info.qEnds
255 qIDX = align_info.qIDX
256 #if tStrand=='+':
257 # tStart = align_info.tStart
258 # tEnd = align_info.tEnd
259 # tStarts = align_info.tStarts
260 # tEnds = align_info.tEnds
261 # tIDX = align_info.tIDX
262 #else:
263 # tEnd = tSize - align_info.tStart + 1
264 # tStart = tSize - align_info.tEnd + 1
265 # tStarts = map(lambda x: tSize - x + 1, align_info.tStarts)
266 # tEnds = map(lambda x: tSize - x + 1, align_info.tEnds)
267 # tIDX = map(lambda x: tSize - x + 1, align_info.tIDX)
268
269 #if qStrand=='+':
270 # qStart = align_info.qStart
271 # qEnd = align_info.qEnd
272 # qStarts = align_info.qStarts
273 # qEnds = align_info.qEnds
274 # qIDX = align_info.qIDX
275 #else:
276 # qEnd = qSize - align_info.qStart + 1
277 # qStart = qSize - align_info.qEnd + 1
278 # qStarts = map(lambda x: qSize - x + 1, align_info.qStarts)
279 # qEnds = map(lambda x: qSize - x + 1, align_info.qEnds)
280 # qIDX = map(lambda x: qSize - x + 1, align_info.qIDX)
281
282 num_exons = align_info.num_exons
283 qExonSizes = align_info.qExonSizes
284 tExonSizes = align_info.tExonSizes
285 exon_length = align_info.exon_length
286 identity = align_info.identity
287 ssQuality = align_info.ssQuality
288 exonQuality = align_info.exonQuality
289 comparison = align_info.comparison
290 dna_letters = align_info.dna_letters
291 est_letters = align_info.est_letters
292
293 if parameters['print_psl2']:
294 print_line(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
295 tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
296 tExonSizes, tStarts, tEnds, exon_length, identity, ssQuality, out_file)
297 else:
298 print_line_blatformat(score, match, mism, qGapb, tGapb, qStrand, tStrand, qName, qSize, qStart, qEnd,\
299 tName, tSize, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
300 tExonSizes, tStarts, tEnds, exon_length, exonQuality, ssQuality, out_file)
301
302 if parameters['print_sequences']:
303 out_file.write("\n")
304 print_align(dna_letters[0], est_letters[0], comparison, qIDX, tIDX, out_file)
305
306 if print_header and bed_file is not None:
307 print_bed_header(bed_file)
308 print_bed_line(tName, tStart, tEnd, qName, score, tStrand, num_exons, tStarts, tEnds, bed_file)
309