+ added selection option for alignment format
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 import numpy
11
12 import qparser
13
14 def create_alignment_file(current_loc,out_fname):
15 import numpy
16 #qparser.parse_reads(filtered_reads)
17
18 out_fh = open(out_fname,'w+')
19
20 allPredictions = cPickle.load(open(current_loc))
21
22 allPositions = {}
23
24 for current_prediction in allPredictions:
25 id = current_prediction['id']
26
27 #current_ground_truth = qparser.fetch_read(id)
28 #true_cut = current_ground_truth['true_cut']
29 #seq = current_ground_truth['seq']
30 #q1 = current_ground_truth['prb']
31
32 seq = current_prediction['est']
33 dna = current_prediction['dna']
34
35 # CHECK !!!
36 q1 = 'zzz'
37
38 chromo = current_prediction['chr']
39 strand = current_prediction['strand']
40 start_pos = current_prediction['start_pos']
41 alignment = current_prediction['alignment']
42
43 DPScores = current_prediction['DPScores']
44
45 predExons = current_prediction['predExons']
46 predExons = [e+start_pos for e in predExons]
47
48
49 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
50 tExonSizes,tStarts, tEnds) = alignment
51
52
53 EST2GFF = False
54 if EST2GFF:
55 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
56
57 match = predExons[0,1] - predExons[0,0]
58 if predExons.shape == (2,2):
59 match += predExons[1,1] - predExons[1,0]
60
61 mismatch = 0
62 repmatch = 0
63 Ns = 0
64 Qgapcnt = 0
65 Qgapbs = 0
66
67 Tgapcnt = 0
68 Tgapbs = 0
69
70 if predExons.shape == (2,2):
71 Tgapbs += predExons[1,0] - predExons[0,1]
72 Tgapcnt = 1
73
74 # &strand, Qname, &Qsize,
75 # &Qstart, &Qend, Tname, &Tsize,
76 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
77
78 #
79 # ATTENTION
80 #
81 # we enforce equal exons sizes for q and t because we are missing indel
82 # positions of the alignment
83
84 if qExonSizes[0] != tExonSizes[0]:
85 continue
86
87 Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
88 Qsize = len(seq)
89 Qstart = qStart
90 Qend = qEnd
91 Tname = 'CHR%d'%chromo
92
93 start_pos -= 2
94
95 Tsize = tEnd+1 + start_pos
96 Tstart = tStart + start_pos
97 Tend = tEnd + start_pos
98 blockcnt = Tgapcnt+1
99 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
100 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
101 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
102
103 try:
104 new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
105 (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
106 str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
107 str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
108 str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
109 except:
110 pdb.set_trace()
111
112 #new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
113 #% (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
114 #Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
115 #Tname, Tsize, Tstart, Tend,\
116 #blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
117
118 out_fh.write(new_line)
119
120 return allPositions
121
122 if __name__ == '__main__':
123 current_dir = sys.argv[1]
124 out_fh = sys.argv[2]
125 create_alignment_file(current_dir,out_fh)