+ changed output format of alignment file script to be compatible with est2gff
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import cPickle
5 import sys
6 import pdb
7 import os
8 import os.path
9 import math
10 import numpy
11
12 #from qpalma.parsers import *
13
14 from Evaluation import load_chunks
15 import qparser
16
17
18 def create_aglignment_file(allPredictions,out_fname):
19 import numpy
20 #qparser.parse_reads(filtered_reads)
21
22 out_fh = open(out_fname,'w+')
23
24 #allPredictionsmllPredictions = load_chunks(current_loc)
25 #allPredictions = cPickle.load(open(current_loc))
26
27 allPositions = {}
28
29 for current_prediction in allPredictions:
30 id = current_prediction['id']
31
32 #current_ground_truth = qparser.fetch_read(id)
33 #true_cut = current_ground_truth['true_cut']
34 #seq = current_ground_truth['seq']
35 #q1 = current_ground_truth['prb']
36
37 seq = current_prediction['est']
38 dna = current_prediction['dna']
39
40 # CHECK !!!
41 q1 = 'zzz'
42
43 chromo = current_prediction['chr']
44 strand = current_prediction['strand']
45 start_pos = current_prediction['start_pos']
46 alignment = current_prediction['alignment']
47
48 predExons = current_prediction['predExons']
49 predExons = [e+start_pos for e in predExons]
50
51 #p_start = current_ground_truth['p_start']
52 #e_stop = current_ground_truth['exon_stop']
53 #e_start = current_ground_truth['exon_start']
54 #p_stop = current_ground_truth['p_stop']
55
56 # &match, &mismatch, &repmatch, &Ns, &Qgapcnt, &Qgapbs, &Tgapcnt, &Tgapbs,
57 predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
58 #predExons = predExons.T
59
60 #print predExons
61 #pdb.set_trace()
62
63 match = predExons[0,1] - predExons[0,0]
64 if predExons.shape == (2,2):
65 match += predExons[1,1] - predExons[1,0]
66
67 mismatch = 0
68 repmatch = 0
69 Ns = 0
70 Qgapcnt = 0
71 Qgapbs = 0
72
73 Tgapcnt = 0
74 Tgapbs = 0
75
76 if predExons.shape == (2,2):
77 Tgapbs += predExons[1,0] - predExons[0,1]
78 Tgapcnt = 1
79
80 # &strand, Qname, &Qsize,
81
82 # &Qstart, &Qend, Tname, &Tsize,
83 # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
84
85 (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
86 tExonSizes,tStarts, tEnds) = alignment
87
88 #
89 # ATTENTION
90 #
91 # we enforce equal exons sizes for q and t because we are missing indel
92 # positions of the alignment
93
94 if qExonSizes[0] != tExonSizes[0]:
95 continue
96
97 Qname = str(id)
98 Qsize = len(seq)
99 Qstart = qStart
100 Qend = qEnd
101 Tname = 'CHR%d'%chromo
102
103 start_pos -= 2
104
105 Tsize = tEnd+1 + start_pos
106 Tstart = tStart + start_pos
107 Tend = tEnd + start_pos
108 blockcnt = Tgapcnt+1
109 exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
110 Qstarts_str = str(qStarts)[1:-1].replace(' ','')
111 Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
112
113 #new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
114 #(id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
115 #str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
116 #str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
117 #str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
118
119 new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
120 % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
121 Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
122 Tname, Tsize, Tstart, Tend,\
123 blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
124
125
126 out_fh.write(new_line)
127
128 return allPositions
129
130 if __name__ == '__main__':
131 current_dir = sys.argv[1]
132 out_fh = sys.argv[2]
133 create_aglignment_file(current_dir,out_fh)