+ removed some obsolete dependencies
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
index 9e97190..e263395 100644 (file)
 # -*- coding: utf-8 -*- 
 
 import cPickle
-import sys
-import pdb
-import os
-import os.path
 import math
 import numpy
+import os
+import os.path
+import pdb
+import sys
 
-#from qpalma.parsers import *
+from Utils import pprint_alignment
 
-from Evaluation import load_chunks
-import qparser
+import palma.palma_utils as pu
+from palma.output_formating import print_results
 
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
 
-def create_aglignment_file(allPredictions,out_fname):
-   import numpy
-   #qparser.parse_reads(filtered_reads)
 
-   out_fh = open(out_fname,'w+')
+def alignment_reconstruct(current_prediction,num_exons):
+   """
+   We reconstruct the exact alignment to infer the positions of indels and the
+   sizes of the respective exons.
+   """
+   translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char) 
 
-   #allPredictionsmllPredictions = load_chunks(current_loc)
-   #allPredictions = cPickle.load(open(current_loc))
+   SpliceAlign = current_prediction['spliceAlign']
+   estAlign    = current_prediction['estAlign']
 
-   allPositions = {}
+   dna_array   = current_prediction['dna_array']
+   read_array  = current_prediction['read_array']
 
-   for current_prediction in allPredictions:
-      id = current_prediction['id']
+   dna_array   = "".join(map(lambda x: translation[int(x)],dna_array))
+   read_array  = "".join(map(lambda x: translation[int(x)],read_array))
 
-      #current_ground_truth = qparser.fetch_read(id)
-      #true_cut = current_ground_truth['true_cut']
-      #seq = current_ground_truth['seq']
-      #q1 = current_ground_truth['prb']
+   # this array holds a number for each exon indicating its number of matches
+   exonIdentities = [0]*num_exons
+   exonGaps       = [0]*num_exons
 
-      seq         = current_prediction['est']
-      dna         = current_prediction['dna']
+   gaps     = 0
+   identity = 0
+   idx      = 0
 
-      # CHECK !!!
-      q1          = 'zzz'
+   for ridx in range(len(read_array)):
+      read_elem = read_array[ridx]
+      dna_elem = dna_array[ridx]
 
-      chromo      = current_prediction['chr']
-      strand      = current_prediction['strand']
-      start_pos   = current_prediction['start_pos']
-      alignment   = current_prediction['alignment']
+      if ridx > 0 and read_array[ridx-1] != '_' and read_array[ridx] == '_':
+         idx += 1
+         gaps = 0
+         identity = 0
+         
+      if read_elem == '_':
+         continue
+
+      if read_elem == dna_elem:
+         identity += 1
+
+      if read_elem == '-':
+         gaps += 1
+      
+      exonIdentities[idx] = identity
+      exonGaps[idx] = gaps
+   
+   return exonIdentities,exonGaps
 
-      predExons = current_prediction['predExons']
-      predExons = [e+start_pos for e in predExons]
 
-      #p_start  = current_ground_truth['p_start']
-      #e_stop   = current_ground_truth['exon_stop']
-      #e_start  = current_ground_truth['exon_start']
-      #p_stop   = current_ground_truth['p_stop']
+def create_alignment_file(current_loc,out_fname):
 
-      # &match, &mismatch, &repmatch, &Ns, &Qgapcnt, &Qgapbs, &Tgapcnt, &Tgapbs,                                                                                                                                              
-      predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
-      #predExons = predExons.T
+   out_fh = open(out_fname,'w+')
+   allPredictions = cPickle.load(open(current_loc))
+   #allPredictions = current_loc
 
-      #print predExons
-      #pdb.set_trace()
+   print 'Loaded %d examples' % len(allPredictions)
 
-      match = predExons[0,1] - predExons[0,0] 
-      if predExons.shape == (2,2):
-         match += predExons[1,1] - predExons[1,0] 
+   # fetch the data needed
+   g_dir    = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+   don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
 
-      mismatch = 0
-      repmatch = 0
-      Ns = 0
-      Qgapcnt = 0
-      Qgapbs = 0
+   g_fmt = 'chr%d.dna.flat'
+   s_fmt = 'contig_%d%s'
 
-      Tgapcnt = 0
-      Tgapbs = 0
+   num_chromo = 6
 
-      if predExons.shape == (2,2):
-         Tgapbs += predExons[1,0] - predExons[0,1]
-         Tgapcnt = 1
+   accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+   seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
 
-      # &strand, Qname, &Qsize, 
-      
-      # &Qstart, &Qend, Tname, &Tsize,
-      # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
+   # Uniqueness filtering using alignment score for reads with several
+   # alignments
+   allUniquePredictions = {}
 
-      (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
-      tExonSizes,tStarts, tEnds) = alignment 
+   for new_prediction in allPredictions:
+      id = new_prediction['id']
+      if allUniquePredictions.has_key(id):
+         current_prediction = allUniquePredictions[id]
 
-      #
-      # ATTENTION
-      #  
-      # we enforce equal exons sizes for q and t because we are missing indel
-      # positions of the alignment
+         current_a_score = current_prediction['DPScores'].flatten().tolist()[0][0]
+         new_score =  new_prediction['DPScores'].flatten().tolist()[0][0]
 
-      if qExonSizes[0] != tExonSizes[0]:
-         continue
+         if current_a_score < new_score :
+            allUniquePredictions[id] = new_prediction
+
+      else:
+         allUniquePredictions[id] = new_prediction
+
+   written_lines = 0
+   for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
+
+      id = current_prediction['id']
+
+      seq         = current_prediction['read']
+      dna         = current_prediction['dna']
 
-      Qname = str(id)
-      Qsize = len(seq)
-      Qstart = qStart
-      Qend = qEnd
-      Tname = 'CHR%d'%chromo
+      # CHECK !!!
+      q1          = 'zzz'
 
-      start_pos -= 2
+      chromo      = current_prediction['chr']
+      strand      = current_prediction['strand']
+      start_pos   = current_prediction['start_pos']
+      alignment   = current_prediction['alignment']
 
-      Tsize = tEnd+1 + start_pos
-      Tstart =  tStart + start_pos
-      Tend = tEnd + start_pos
-      blockcnt = Tgapcnt+1
-      exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
-      Qstarts_str = str(qStarts)[1:-1].replace(' ','')
-      Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+      DPScores    = current_prediction['DPScores']
+      predExons   = current_prediction['predExons']
+      predExons   = [e+start_pos for e in predExons]
 
-      #new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-      #(id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
-      #str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
-      #str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
-      #str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+      (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
+      tExonSizes,tStarts, tEnds) = alignment
 
-      new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
-      % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
-      Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
-      Tname, Tsize, Tstart, Tend,\
-      blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+      if len(qExonSizes) != num_exons:
+         print 'BUG exon sizes %d'%id
+         continue
 
+      EST2GFF = False
+      new_line = ''
+
+      if EST2GFF:
+         predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
+
+         match = predExons[0,1] - predExons[0,0] 
+         if predExons.shape == (2,2):
+            match += predExons[1,1] - predExons[1,0] 
+
+         mismatch = 0
+         repmatch = 0
+         Ns = 0
+         Qgapcnt = 0
+         Qgapbs = 0
+
+         Tgapcnt = 0
+         Tgapbs = 0
+
+         if predExons.shape == (2,2):
+            Tgapbs += predExons[1,0] - predExons[0,1]
+            Tgapcnt = 1
+
+         # &strand, Qname, &Qsize, 
+         # &Qstart, &Qend, Tname, &Tsize,
+         # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
+         #
+         # ATTENTION
+         #  
+         # we enforce equal exons sizes for q and t because we are missing indel
+         # positions of the alignment
+
+         if qExonSizes[0] != tExonSizes[0]:
+            continue
+
+         Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
+         Qsize = len(seq)
+         Qstart = qStart
+         Qend = qEnd
+         Tname = 'CHR%d'%chromo
+
+         start_pos -= 2
+
+         Tsize = tEnd+1 + start_pos
+         Tstart =  tStart + start_pos
+         Tend = tEnd + start_pos
+         blockcnt = Tgapcnt+1
+         exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
+         Qstarts_str = str(qStarts)[1:-1].replace(' ','')
+         Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+
+         new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+         % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+         Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+         Tname, Tsize, Tstart, Tend,\
+         blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+
+      else:
+         #try:
+         exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+         #except:
+         #   print 'Bug in example %d (idx: %d)' %\
+         #   (current_prediction['id'],pred_idx)
+         #   continue
+
+         #t_size = tEnd - tStart
+         #read_size = 38
+         #if strand == '-':
+         #   total_size = seqInfo.chromo_sizes[chromo]
+
+         #   start_pos = total_size - start_pos
+
+         #   qExonSizes.reverse()
+         #   qStarts  = [read_size-e for e in qEnds]
+         #   qStarts.reverse()
+         #   qEnds  = [read_size-e for e in qStarts]
+         #   qEnds.reverse()
+
+         #   tExonSizes.reverse()
+         #   tStarts  = [t_size-e for e in tEnds]
+         #   tStarts.reverse()
+         #   tEnds    = [t_size-e for e in tStarts]
+         #   tEnds.reverse()
+
+         #   exonIdentities.reverse()
+         #   exonGaps.reverse()
+
+         pp = lambda x : str(x)[1:-1].replace(' ','')
+
+         new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+         (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+         pp(qExonSizes),pp(qStarts),\
+         pp(qEnds),pp(tExonSizes),\
+         pp(tStarts),pp(tEnds),\
+         pp(exonIdentities),pp(exonGaps))
 
       out_fh.write(new_line)
+      written_lines += 1
+
+   print 'Wrote out %d lines' % written_lines
 
-   return allPositions
 
 if __name__ == '__main__':
    current_dir = sys.argv[1]
    out_fh = sys.argv[2]
-   create_aglignment_file(current_dir,out_fh)
+   create_alignment_file(current_dir,out_fh)