+ added alignment reconstruction function
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 18 Jun 2008 12:59:28 +0000 (12:59 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 18 Jun 2008 12:59:28 +0000 (12:59 +0000)
+ add script for est2gff processing

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9653 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Utils.py
scripts/createAlignmentFileFromPrediction.py
scripts/est2gff_pipeline.sh
scripts/grid_alignment.py
scripts/qpalma_main.py

index 74d83a4..1c3c4f7 100644 (file)
@@ -131,7 +131,8 @@ def pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array):
          dna_array[idx] = translation[int(dna_array[idx])]
          est_array[idx] = translation[int(est_array[idx])]
    except:
-      pdb.set_trace()
+      #pdb.set_trace()
+      pass
 
    line1 = "".join(dna_array)
    line2 = comparison
index 54d3beb..922dc70 100644 (file)
@@ -11,6 +11,68 @@ import numpy
 
 import qparser
 
+from Utils import pprint_alignment
+
+import palma.palma_utils as pu
+from palma.output_formating import print_results
+
+
+def alignment_reconstruct(current_prediction,num_exons):
+   """
+   We reconstruct the exact alignment to infer the positions of indels and the
+   sizes of the respective exons.
+   """
+
+   SpliceAlign = current_prediction['spliceAlign']
+   estAlign    = current_prediction['estAlign']
+
+   dna_array   = current_prediction['dna_array']
+   est_array   = current_prediction['est_array']
+
+   dna_array   = "".join(map(lambda x: str(x),dna_array))
+   est_array   = "".join(map(lambda x: str(x),est_array))
+
+   translation = '-ACGTN_'    #how aligned est and dna sequence is displayed
+                              #(gap_char, 5 nucleotides, intron_char) 
+
+   # this array hold a number for each exons indicating its number of matches
+   exonIdentities = [0]*num_exons
+   exonGaps = [0]*num_exons
+
+   est_array = map(lambda x: translation[int(x)],est_array)
+   dna_array = map(lambda x: translation[int(x)],dna_array)
+
+   gaps = 0
+   identity = 0
+   idx = 0
+
+   #if num_exons > 1:
+   #   pdb.set_trace()
+
+   for ridx in range(len(est_array)):
+      read_elem = est_array[ridx]
+      dna_elem = dna_array[ridx]
+
+      if ridx > 0 and est_array[ridx-1] != '_' and est_array[ridx] == '_':
+         idx += 1
+         gaps = 0
+         identity = 0
+         
+      if read_elem == '_':
+         continue
+
+      if read_elem == dna_elem:
+         identity += 1
+
+      if read_elem == '-':
+         gaps += 1
+      
+      exonIdentities[idx] = identity
+      exonGaps[idx] = gaps
+
+   return exonIdentities,exonGaps
+
+
 def create_alignment_file(current_loc,out_fname):
    import numpy
    #qparser.parse_reads(filtered_reads)
@@ -18,6 +80,7 @@ def create_alignment_file(current_loc,out_fname):
    out_fh = open(out_fname,'w+')
 
    allPredictions = cPickle.load(open(current_loc))
+   #allPredictions = current_loc
 
    allPositions = {}
 
@@ -45,12 +108,12 @@ def create_alignment_file(current_loc,out_fname):
       predExons = current_prediction['predExons']
       predExons = [e+start_pos for e in predExons]
 
-
       (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
       tExonSizes,tStarts, tEnds) = alignment 
 
-      
       EST2GFF = False
+      new_line = ''
+
       if EST2GFF:
          predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
 
@@ -74,7 +137,6 @@ def create_alignment_file(current_loc,out_fname):
          # &strand, Qname, &Qsize, 
          # &Qstart, &Qend, Tname, &Tsize,
          # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
-
          #
          # ATTENTION
          #  
@@ -100,25 +162,31 @@ def create_alignment_file(current_loc,out_fname):
          Qstarts_str = str(qStarts)[1:-1].replace(' ','')
          Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
 
-      try:
-         new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+         new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
+         % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
+         Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
+         Tname, Tsize, Tstart, Tend,\
+         blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+
+      else:
+
+         num_exons = len(qExonSizes)
+         exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+
+         pp = lambda x : str(x)[1:-1].replace(' ','')
+
+         new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
          (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
          str(qExonSizes)[1:-1].replace(' ',''),str(qStarts)[1:-1].replace(' ',''),\
          str(qEnds)[1:-1].replace(' ',''),str(tExonSizes)[1:-1].replace(' ',''),\
-         str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''))
-      except:
-         pdb.set_trace()
-
-      #new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
-      #% (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
-      #Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
-      #Tname, Tsize, Tstart, Tend,\
-      #blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
+         str(tStarts)[1:-1].replace(' ',''),str(tEnds)[1:-1].replace(' ',''),\
+         str(exonIdentities)[1:-1].replace(' ',''),str(exonGaps)[1:-1].replace(' ',''))
 
       out_fh.write(new_line)
 
    return allPositions
 
+
 if __name__ == '__main__':
    current_dir = sys.argv[1]
    out_fh = sys.argv[2]
index ec98957..926df1d 100755 (executable)
@@ -20,7 +20,7 @@ do
    for STRAND in "+" "-"
    do
       echo $CHR $STRAND
-      result1_fn=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/SpliceGraphResults/gff/myexons_${CHR}${STRAND}.gff
+      result1_fn=/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/SpliceGraphResults/gff/test_myexons_${CHR}${STRAND}.gff
       result2_fn=/dev/null
    
       /fml/ag-raetsch/home/fabio/svn/projects/splicing/gff/bin/psl2gff EST $alignment_file $CHR ${STRAND} $result1_fn $result2_fn $g_config 1>>LOG
index c2b8cff..0959c5d 100644 (file)
@@ -24,7 +24,9 @@ def g_alignment(chunk_fn,result_fn):
 def create_and_submit():
    jp = os.path.join
 
-   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
+   #run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
+   run_dir  = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
+   
    data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
 
    chunks_fn = []
@@ -32,6 +34,20 @@ def create_and_submit():
       if fn.startswith('chunk'):
          chunks_fn.append(fn)
 
+
+   chunks_fn = [\
+   'chunk_17.predictions.pickle',\
+   'chunk_18.predictions.pickle',\
+   'chunk_19.predictions.pickle',\
+   'chunk_20.predictions.pickle',\
+   'chunk_21.predictions.pickle',\
+   'chunk_23.predictions.pickle',\
+   'chunk_24.predictions.pickle'
+   ]
+
+   print chunks_fn
+
+
    functionJobs=[]
 
    for chunk_fn in chunks_fn:
index cf97fb4..d0a17ad 100644 (file)
@@ -742,8 +742,8 @@ class QPalma:
        
       alignment = get_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array) #(qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds, tExonSizes, tStarts, tEnds)
 
-      dna_array = array.array('c',dna_array)
-      est_array = array.array('c',est_array)
+      dna_array = array.array('B',dna_array)
+      est_array = array.array('B',est_array)
 
       #line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
       #self.plog(line1+'\n')