--- /dev/null
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+import cPickle
+import math
+import numpy
+import os
+import os.path
+import pdb
+import sys
+
+from qpalma.utils import pprint_alignment
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
+
+
+def alignment_reconstruct(current_prediction,num_exons):
+ """
+ We reconstruct the exact alignment to infer the positions of indels and the
+ sizes of the respective exons.
+ """
+ translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char)
+
+ SpliceAlign = current_prediction['spliceAlign']
+ estAlign = current_prediction['estAlign']
+
+ dna_array = current_prediction['dna_array']
+ read_array = current_prediction['read_array']
+
+ dna_array = "".join(map(lambda x: translation[int(x)],dna_array))
+ read_array = "".join(map(lambda x: translation[int(x)],read_array))
+
+ # this array holds a number for each exon indicating its number of matches
+ exonIdentities = [0]*num_exons
+ exonGaps = [0]*num_exons
+
+ gaps = 0
+ identity = 0
+ idx = 0
+
+ for ridx in range(len(read_array)):
+ read_elem = read_array[ridx]
+ dna_elem = dna_array[ridx]
+
+ if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
+ idx += 1
+ gaps = 0
+ identity = 0
+
+ if read_elem == '_':
+ continue
+
+ if read_elem == dna_elem:
+ identity += 1
+
+ if read_elem == '-':
+ gaps += 1
+
+ exonIdentities[idx] = identity
+ exonGaps[idx] = gaps
+
+ return exonIdentities,exonGaps
+
+
+def createGenefindingOutput(current_loc,out_fname):
+ """
+ """
+ pass
+
+def createShoreOutput(current_loc,out_fname):
+ """
+ """
+ pass
+
+
+def createBLATOutput(current_loc,out_fname):
+
+ out_fh = open(out_fname,'w+')
+ allPredictions = cPickle.load(open(current_loc))
+ #allPredictions = current_loc
+
+ print 'Loaded %d examples' % len(allPredictions)
+
+ # fetch the data needed
+ accessWrapper = DataAccessWrapper(settings)
+ seqInfo = SeqSpliceInfo(accessWrapper,,settings['allowed_fragments'])
+
+ # Uniqueness filtering using alignment score for reads with several
+ # alignments
+ allUniquePredictions = {}
+
+ for new_prediction in allPredictions:
+ id = new_prediction['id']
+ if allUniquePredictions.has_key(id):
+ current_prediction = allUniquePredictions[id]
+
+ current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
+ new_score = new_prediction['DPScores'].flatten().tolist()[0][0]
+
+ if current_a_score < new_score :
+ allUniquePredictions[id] = new_prediction
+
+ else:
+ allUniquePredictions[id] = new_prediction
+
+ written_lines = 0
+ for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
+
+ id = current_prediction['id']
+
+ seq = current_prediction['read']
+ dna = current_prediction['dna']
+
+ # CHECK !!!
+ q1 = 'zzz'
+
+ chromo = current_prediction['chr']
+ strand = current_prediction['strand']
+ start_pos = current_prediction['start_pos']
+ alignment = current_prediction['alignment']
+
+ DPScores = current_prediction['DPScores']
+ predExons = current_prediction['predExons']
+ predExons = [e+start_pos for e in predExons]
+
+ (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+ tExonSizes,tStarts, tEnds) = alignment
+
+ if len(qExonSizes) != num_exons:
+ print 'BUG exon sizes %d'%id
+ continue
+
+ EST2GFF = False
+ new_line = ''
+
+ if EST2GFF:
+ predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
+
+ match = predExons[0,1] - predExons[0,0]
+ if predExons.shape == (2,2):
+ match += predExons[1,1] - predExons[1,0]
+
+ mismatch = 0
+ repmatch = 0
+ Ns = 0
+ Qgapcnt = 0
+ Qgapbs = 0
+
+ Tgapcnt = 0
+ Tgapbs = 0
+
+ if predExons.shape == (2,2):
+ Tgapbs += predExons[1,0] - predExons[0,1]
+ Tgapcnt = 1
+
+ # &strand, Qname, &Qsize,
+ # &Qstart, &Qend, Tname, &Tsize,
+ # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
+ #
+ # ATTENTION
+ #
+ # we enforce equal exons sizes for q and t because we are missing indel
+ # positions of the alignment
+
+ if qExonSizes[0] != tExonSizes[0]:
+ continue
+
+ Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
+ Qsize = len(seq)
+ Qstart = qStart
+ Qend = qEnd
+ Tname = 'CHR%d'%chromo
+
+ start_pos -= 2
+
+ Tsize = tEnd+1 + start_pos
+ Tstart = tStart + start_pos
+ Tend = tEnd + start_pos
+ blockcnt = Tgapcnt+1
+ exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
+ Qstarts_str = str(qStarts)[1:-1].replace(' ','')
+ Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+
+ new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+ % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+ Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+ Tname, Tsize, Tstart, Tend,\
+ blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+
+ else:
+ #try:
+ exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+ #except:
+ # print 'Bug in example %d (idx: %d)' %\
+ # (current_prediction['id'],pred_idx)
+ # continue
+
+ #t_size = tEnd - tStart
+ #read_size = 38
+ #if strand == '-':
+ # total_size = seqInfo.chromo_sizes[chromo]
+
+ # start_pos = total_size - start_pos
+
+ # qExonSizes.reverse()
+ # qStarts = [read_size-e for e in qEnds]
+ # qStarts.reverse()
+ # qEnds = [read_size-e for e in qStarts]
+ # qEnds.reverse()
+
+ # tExonSizes.reverse()
+ # tStarts = [t_size-e for e in tEnds]
+ # tStarts.reverse()
+ # tEnds = [t_size-e for e in tStarts]
+ # tEnds.reverse()
+
+ # exonIdentities.reverse()
+ # exonGaps.reverse()
+
+ pp = lambda x : str(x)[1:-1].replace(' ','')
+
+ new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+ (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+ pp(qExonSizes),pp(qStarts),\
+ pp(qEnds),pp(tExonSizes),\
+ pp(tStarts),pp(tEnds),\
+ pp(exonIdentities),pp(exonGaps))
+
+ out_fh.write(new_line)
+ written_lines += 1
+
+ print 'Wrote out %d lines' % written_lines