+ added alignment reconstruction function
[qpalma.git] / scripts / createAlignmentFileFromPrediction.py
index cc37313..922dc70 100644 (file)
@@ -7,143 +7,187 @@ import pdb
 import os
 import os.path
 import math
-from qpalma.parsers import *
+import numpy
 
-from Evaluation import load_chunks
+import qparser
 
-def prediction_on(current_dir,filtered_reads,out_fname):
-    
-   #print 'parsing filtered reads..'
-   #all_filtered_reads = parse_filtered_reads(filtered_reads)
-   #print 'found %d filtered reads' % len(all_filtered_reads)
+from Utils import pprint_alignment
 
-   import qparser
-   qparser.parse_reads(filtered_reads)
+import palma.palma_utils as pu
+from palma.output_formating import print_results
+
+
+def alignment_reconstruct(current_prediction,num_exons):
+   """
+   We reconstruct the exact alignment to infer the positions of indels and the
+   sizes of the respective exons.
+   """
+
+   SpliceAlign = current_prediction['spliceAlign']
+   estAlign    = current_prediction['estAlign']
+
+   dna_array   = current_prediction['dna_array']
+   est_array   = current_prediction['est_array']
+
+   dna_array   = "".join(map(lambda x: str(x),dna_array))
+   est_array   = "".join(map(lambda x: str(x),est_array))
+
+   translation = '-ACGTN_'    #how aligned est and dna sequence is displayed
+                              #(gap_char, 5 nucleotides, intron_char) 
+
+   # this array hold a number for each exons indicating its number of matches
+   exonIdentities = [0]*num_exons
+   exonGaps = [0]*num_exons
+
+   est_array = map(lambda x: translation[int(x)],est_array)
+   dna_array = map(lambda x: translation[int(x)],dna_array)
+
+   gaps = 0
+   identity = 0
+   idx = 0
+
+   #if num_exons > 1:
+   #   pdb.set_trace()
+
+   for ridx in range(len(est_array)):
+      read_elem = est_array[ridx]
+      dna_elem = dna_array[ridx]
+
+      if ridx > 0 and est_array[ridx-1] != '_' and est_array[ridx] == '_':
+         idx += 1
+         gaps = 0
+         identity = 0
+         
+      if read_elem == '_':
+         continue
+
+      if read_elem == dna_elem:
+         identity += 1
+
+      if read_elem == '-':
+         gaps += 1
+      
+      exonIdentities[idx] = identity
+      exonGaps[idx] = gaps
+
+   return exonIdentities,exonGaps
+
+
+def create_alignment_file(current_loc,out_fname):
+   import numpy
+   #qparser.parse_reads(filtered_reads)
 
    out_fh = open(out_fname,'w+')
 
-   allPredictions = load_chunks(current_dir)
+   allPredictions = cPickle.load(open(current_loc))
+   #allPredictions = current_loc
+
    allPositions = {}
 
    for current_prediction in allPredictions:
       id = current_prediction['id']
 
-      #current_ground_truth = all_filtered_reads[id]
-      current_ground_truth = qparser.fetch_read(id)
+      #current_ground_truth = qparser.fetch_read(id)
+      #true_cut = current_ground_truth['true_cut']
+      #seq = current_ground_truth['seq']
+      #q1 = current_ground_truth['prb']
 
-      true_cut = current_ground_truth['true_cut']
-      seq = current_ground_truth['seq']
-      q1 = current_ground_truth['prb']
+      seq         = current_prediction['est']
+      dna         = current_prediction['dna']
 
-      chr         = current_prediction['chr']
+      # CHECK !!!
+      q1          = 'zzz'
+
+      chromo      = current_prediction['chr']
       strand      = current_prediction['strand']
-      #true_cut    = current_prediction['true_cut']
       start_pos   = current_prediction['start_pos']
       alignment   = current_prediction['alignment']
 
+      DPScores   = current_prediction['DPScores']
+
       predExons = current_prediction['predExons']
       predExons = [e+start_pos for e in predExons]
 
-      p_start  = current_ground_truth['p_start']
-      e_stop   = current_ground_truth['exon_stop']
-      e_start  = current_ground_truth['exon_start']
-      p_stop   = current_ground_truth['p_stop']
-      cut_pos = current_ground_truth['true_cut']
-
-      correct_prediction = False
-
-      #if len(predExons) == 4:
-      #   spliced_flag = True
-      #   predExons[1] -= 1
-      #   predExons[3] -= 1
-
-      #   if p_start == predExons[0] and e_stop == predExons[1] and\
-      #   e_start == predExons[2] and p_stop == predExons[3]:
-      #      correct_prediction = True
-
-      #   if p_start == predExons[0] and e_stop == predExons[1] and\
-      #   e_start == predExons[2] and p_stop+1 == predExons[3]:
-      #      print 'special case'
-
-      #elif len(predExons) == 2:
-      #   spliced_flag = False
-      #   predExons[1] -= 1
-
-      #   if math.fabs(p_start - predExons[0]) <= 0 and math.fabs(p_stop - predExons[1]) <= 2:
-      #      correct_prediction = True
-      #      
-      #else:
-      #   pass
-
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
       tExonSizes,tStarts, tEnds) = alignment 
 
-      new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-      (id,chr,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
-      str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
-      str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
-      str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
+      EST2GFF = False
+      new_line = ''
+
+      if EST2GFF:
+         predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
+
+         match = predExons[0,1] - predExons[0,0] 
+         if predExons.shape == (2,2):
+            match += predExons[1,1] - predExons[1,0] 
+
+         mismatch = 0
+         repmatch = 0
+         Ns = 0
+         Qgapcnt = 0
+         Qgapbs = 0
+
+         Tgapcnt = 0
+         Tgapbs = 0
+
+         if predExons.shape == (2,2):
+            Tgapbs += predExons[1,0] - predExons[0,1]
+            Tgapcnt = 1
+
+         # &strand, Qname, &Qsize, 
+         # &Qstart, &Qend, Tname, &Tsize,
+         # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
+         #
+         # ATTENTION
+         #  
+         # we enforce equal exons sizes for q and t because we are missing indel
+         # positions of the alignment
+
+         if qExonSizes[0] != tExonSizes[0]:
+            continue
+
+         Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
+         Qsize = len(seq)
+         Qstart = qStart
+         Qend = qEnd
+         Tname = 'CHR%d'%chromo
+
+         start_pos -= 2
+
+         Tsize = tEnd+1 + start_pos
+         Tstart =  tStart + start_pos
+         Tend = tEnd + start_pos
+         blockcnt = Tgapcnt+1
+         exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
+         Qstarts_str = str(qStarts)[1:-1].replace(' ','')
+         Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
+
+         new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+         % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+         Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+         Tname, Tsize, Tstart, Tend,\
+         blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+
+      else:
+
+         num_exons = len(qExonSizes)
+         exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+
+         pp = lambda x : str(x)[1:-1].replace(' ','')
+
+         new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+         (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+         str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
+         str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
+         str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''),\
+         str(exonIdentities)[1:-1].replace(' ',''),str(exonGaps)[1:-1].replace(' ',''))
 
       out_fh.write(new_line)
 
    return allPositions
 
 
-#def writePredictions(fname,allPositions):
-#
-#
-#   allEntries = {}
-#
-#   for line in open('/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.pipeline'):
-#      line = line.strip()
-#      id,seq,q1,q2,q3 = line.split()
-#      id = int(id)
-#      
-#      allEntries[id] = (seq,q1,q2,q3)
-#
-#
-#   for id,elems in allPositions.items():
-#      seq,q1,q2,q3 = allEntries[id]
-#      chr,strand,start_pos,true_cut,p1,p2,p3,p4,alignment = elems
-#
-#      p1 += start_pos
-#      p2 += start_pos
-#      p3 += start_pos
-#      p4 += start_pos
-#
-#   out_fh.close()
-#
-
-#def collect_prediction(current_dir):
-#   """
-#   Given the toplevel directory this function takes care that for each distinct
-#   experiment the training and test predictions are evaluated.
-#
-#   """
-#   train_suffix   = '_allPredictions_TRAIN'
-#   test_suffix    = '_allPredictions_TEST'
-#
-#   run_name = 'run_+_quality_+_splicesignals_+_intron_len_1'
-#
-#   jp = os.path.join
-#   b2s = ['-','+']
-#
-#   #currentRun = cPickle.load(open(jp(current_dir,'run_object_1.pickle')))
-#   #QFlag    = currentRun['enable_quality_scores']
-#   #SSFlag   = currentRun['enable_splice_signals']
-#   #ILFlag   = currentRun['enable_intron_length']
-#   #currentRunId = '%s%s%s' % (b2s[QFlag],b2s[SSFlag],b2s[ILFlag])
-#   
-#   filename =  jp(current_dir,run_name)+test_suffix
-#   print 'Prediction on: %s' % filename
-#   test_result = prediction_on(filename)
-#
-#   fname = 'predictions.txt'
-#   writePredictions(fname,test_result)
-#
-
 if __name__ == '__main__':
    current_dir = sys.argv[1]
-   filtered_reads = sys.argv[2]
-   out_fh = sys.argv[3]
-   prediction_on(current_dir,filtered_reads,out_fh)
+   out_fh = sys.argv[2]
+   create_alignment_file(current_dir,out_fh)