+ combined two scripts that were responsible for generating the output format
authorFabio <fabio@congo.fml.local>
Tue, 30 Sep 2008 10:17:59 +0000 (12:17 +0200)
committerFabio <fabio@congo.fml.local>
Tue, 30 Sep 2008 10:17:59 +0000 (12:17 +0200)
qpalma/OutputFormat.py
qpalma/gridtools.py
scripts/qpalma_main.py
tests/test_qpalma_prediction.py

index 4e9dd23..41d730a 100644 (file)
@@ -18,7 +18,11 @@ from qpalma.utils import pprint_alignment
 from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper
 
 
-translation = '-acgtn_' # how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char) 
+# how aligned est and dna sequence is displayed (gap_char, 5 nucleotides, intron_char) 
+translation = '-acgtn_' 
+
+# convenience function for pretty-printing arrays
+pp = lambda x : str(x)[1:-1].replace(' ','')
 
 
 def alignment_reconstruct(current_prediction,num_exons):
@@ -71,7 +75,7 @@ def alignment_reconstruct(current_prediction,num_exons):
 def getUniquePrediction(allPredictions):
    """
    Since one read can have several alignments due to several possible seed
-   positions we have to preprocess all prediction. This function return for
+   positions we have to preprocess all predictions. This function returns for
    every read only one alignment that is the highest scoring under the QPalma
    model.
    """
@@ -95,14 +99,34 @@ def getUniquePrediction(allPredictions):
    return allUniquePredictions 
 
 
-def createOutput(current_loc,out_fname,output_format):
+def recalculatePositions(chromo,start_pos,strand,start,ends,seqInfo,window_size):
+   """
+   If we work on the negative strand we have to recalculate the indices other
+   wise we just have to add an offset i.e. the start position.
+   """
+
+   if strand == '-':
+      total_size = seqInfo.chromo_sizes[chromo]
+      start_pos = total_size - start_pos
+
+   starts   = [int(elem) + start_pos for elem in starts]
+   ends     = [int(elem) + start_pos for elem in ends]
+
+   if strand == '-':
+      starts   = [e-window_size for e in starts]
+      ends     = [e-window_size for e in ends]
+
+   return starts,ends
+
+
+def createAlignmentOutput(settings,chunk_fn,result_fn):
    """
    Given all the predictions and a result file name. This function calculates
    all aligmnment information and returns the result in the specified format.
    """
 
-   out_fh = open(out_fname,'w+')
-   allPredictions = cPickle.load(open(current_loc))
+   out_fh = open(result_fn,'w+')
+   allPredictions = cPickle.load(open(chunk_fn))
    #allPredictions = current_loc
 
    print 'Loaded %d examples' % len(allPredictions)
@@ -114,20 +138,22 @@ def createOutput(current_loc,out_fname,output_format):
    # make predictions unique
    allUniquePredictions = getUniquePrediction(allPredictions)
 
+   output_format  = settings['output_format']
+
    written_lines = 0
    for pred_idx,current_prediction in enumerate(allUniquePredictions.values()):
 
       if output_format == 'BLAT':
          # BLAT output
-         new_line = createBlatOutput(current_prediction)
+         new_line = createBlatOutput(current_prediction,settings)
 
       elif output_format == 'ShoRe':
          # ShoRe output
-         new_line = createShoReOutput(current_prediction)
+         new_line = createShoReOutput(current_prediction,settings)
 
       elif output_format == 'mGene':
          # Genefinding output for mGene
-         new_line = createGenefindingOutput(current_prediction)
+         new_line = createGenefindingOutput(current_prediction,settings)
 
       else:
          assert False
@@ -138,12 +164,21 @@ def createOutput(current_loc,out_fname,output_format):
    print 'Wrote out %d lines' % written_lines
 
 
-def createBlatOutput(current_prediction):
+def createBlatOutput(current_prediction,settings):
    """
    This function takes one prediction as input and returns the corresponding
    alignment in BLAT format.
    """
 
+   # fetch the data needed
+   allowed_fragments = settings['allowed_fragments']
+   accessWrapper     = DataAccessWrapper(settings)
+   seqInfo           = SeqSpliceInfo(accessWrapper,range(1,allowed_fragments ))
+
+   # this is the total size of the seed region
+   window_size = 2*settings['half_window_size']
+
+   # now we fetch the data of the read itself
    id = current_prediction['id']
 
    seq         = current_prediction['read']
@@ -158,8 +193,6 @@ def createBlatOutput(current_prediction):
    alignment   = current_prediction['alignment']
 
    DPScores    = current_prediction['DPScores']
-   predExons   = current_prediction['predExons']
-   predExons   = [e+start_pos for e in predExons]
 
    (qStart, qEnd, tStart, tEnd, num_exons, qExonSizes, qStarts, qEnds,\
    tExonSizes,tStarts, tEnds) = alignment
@@ -168,112 +201,41 @@ def createBlatOutput(current_prediction):
       print 'BUG exon sizes %d'%id
       continue
 
-   EST2GFF = False
    new_line = ''
-
-   if EST2GFF:
-      predExons = numpy.mat(predExons).reshape(len(predExons)/2,2)
-
-      match = predExons[0,1] - predExons[0,0] 
-      if predExons.shape == (2,2):
-         match += predExons[1,1] - predExons[1,0] 
-
-      mismatch = 0
-      repmatch = 0
-      Ns = 0
-      Qgapcnt = 0
-      Qgapbs = 0
-
-      Tgapcnt = 0
-      Tgapbs = 0
-
-      if predExons.shape == (2,2):
-         Tgapbs += predExons[1,0] - predExons[0,1]
-         Tgapcnt = 1
-
-      # &strand, Qname, &Qsize, 
-      # &Qstart, &Qend, Tname, &Tsize,
-      # &Tstart, &Tend, &blockcnt,exonsizes_str,Qstarts_str,Tstarts_str
-      #
-      # ATTENTION
-      #  
-      # we enforce equal exons sizes for q and t because we are missing indel
-      # positions of the alignment
-
-      if qExonSizes[0] != tExonSizes[0]:
-         continue
-
-      Qname = '%d(%2.2f)'%(id,DPScores.tolist()[0][0])
-      Qsize = len(seq)
-      Qstart = qStart
-      Qend = qEnd
-      Tname = 'CHR%d'%chromo
-
-      start_pos -= 2
-
-      Tsize = tEnd+1 + start_pos
-      Tstart =  tStart + start_pos
-      Tend = tEnd + start_pos
-      blockcnt = Tgapcnt+1
-      exonsizes_str = str(tExonSizes)[1:-1].replace(' ','')
-      Qstarts_str = str(qStarts)[1:-1].replace(' ','')
-      Tstarts_str = str(map(lambda x: x+start_pos,tStarts))[1:-1].replace(' ','')
-
-      new_line = "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%c\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%s\n"\
-      % (match, mismatch, repmatch, Ns, Qgapcnt, Qgapbs,\
-      Tgapcnt, Tgapbs, strand, Qname, Qsize, Qstart, Qend,\
-      Tname, Tsize, Tstart, Tend,\
-      blockcnt,exonsizes_str,Qstarts_str,Tstarts_str) #str(predExons))
-
-   else:
-      #try:
-      exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
-      #except:
-      #   print 'Bug in example %d (idx: %d)' %\
-      #   (current_prediction['id'],pred_idx)
-      #   continue
-
-      #t_size = tEnd - tStart
-      #read_size = 38
-      #if strand == '-':
-      #   total_size = seqInfo.chromo_sizes[chromo]
-
-      #   start_pos = total_size - start_pos
-
-      #   qExonSizes.reverse()
-      #   qStarts  = [read_size-e for e in qEnds]
-      #   qStarts.reverse()
-      #   qEnds  = [read_size-e for e in qStarts]
-      #   qEnds.reverse()
-
-      #   tExonSizes.reverse()
-      #   tStarts  = [t_size-e for e in tEnds]
-      #   tStarts.reverse()
-      #   tEnds    = [t_size-e for e in tStarts]
-      #   tEnds.reverse()
-
-      #   exonIdentities.reverse()
-      #   exonGaps.reverse()
-
-      pp = lambda x : str(x)[1:-1].replace(' ','')
-
-      new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
-      (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
-      pp(qExonSizes),pp(qStarts),\
-      pp(qEnds),pp(tExonSizes),\
-      pp(tStarts),pp(tEnds),\
-      pp(exonIdentities),pp(exonGaps))
+   #try:
+   exonIdentities,exonGaps = alignment_reconstruct(current_prediction,num_exons)
+   #except:
+   #   print 'Bug in example %d (idx: %d)' %\
+   #   (current_prediction['id'],pred_idx)
+   #   continue
+
+   tStarts,tEnds = recalculatePositions(chromo,start_pos,strand,tStarts,tEnds,seqInfo,window_size)
+
+   ids   = exonIdentities
+   ids   = [int(elem) for elem in ids]
+   gaps  = exonGaps
+   gaps  = [int(elem) for elem in gaps]
+
+   new_line = '%d\t%d\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %\
+   (id,chromo,strand,seq,str(q1)[1:-1],start_pos,qStart,qEnd,tStart,tEnd,num_exons,\
+   pp(qExonSizes),pp(qStarts),pp(qEnds),pp(tExonSizes),\
+   pp(tStarts),pp(tEnds),pp(ids),pp(gaps))
    
    return new_line
 
 
 def createGenefindingOutput(current_loc,out_fname):
    """
+   This is an output format used by mGene. It consists of positional informations to generate the exon and intron tracks.
    """
+
+   #out_fh.write("%d\t%s\t%s\t%s\t%s\t%s\n"%(chromo,strand,pp(starts),pp(ends),pp(ids),pp(gaps)))
    pass
 
 
 def createShoReOutput(current_loc,out_fname):
    """
+   This format is used by the additional steps implemented in the ShoRe pipeline.
    """
+
    pass
index 49bfc16..d7fd5f0 100644 (file)
@@ -1,6 +1,3 @@
-#!/usr/bin/env python 
-# -*- coding: utf-8 -*- 
-
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -22,7 +19,7 @@ from threading import Thread
 from pythongrid import KybJob, Usage
 from pythongrid import process_jobs, submit_jobs, collect_jobs, get_status
 
-from createAlignmentFileFromPrediction import create_alignment_file
+from qpalma.OutputFormat import createAlignmentOutput
 
 from PipelineHeuristic import *
 
@@ -291,7 +288,7 @@ class PostprocessingTask(ClusterTask):
 
          self.result_files.append(result_fn)
 
-         current_job = KybJob(gridtools.PostProcessingTaskStarter,[chunk_fn,result_fn])
+         current_job = KybJob(gridtools.PostProcessingTaskStarter,[self.settings,chunk_fn,result_fn])
          current_job.h_vmem = '15.0G'
          current_job.express = 'True'
 
@@ -305,8 +302,8 @@ class PostprocessingTask(ClusterTask):
       combine_files(self.result_files,combined_fn)
 
 
-def PostProcessingTaskStarter(chunk_fn,result_fn):
-   create_alignment_file(chunk_fn,result_fn)
+def PostProcessingTaskStarter(settings,chunk_fn,result_fn):
+   createAlignmentOutput(settings,chunk_fn,result_fn)
 
 
 
index 0bad5d9..e68d67d 100644 (file)
@@ -1,6 +1,3 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -192,6 +189,10 @@ class QPalma:
 
 
    def train(self,training_set):
+      """
+      The mainloop for training.
+      """
+
       numExamples = len(training_set)
       self.plog('Number of training examples: %d\n'% numExamples)
 
@@ -477,6 +478,7 @@ class QPalma:
 #
 ###############################################################################
 
+
    def init_prediction(self,dataset_fn,prediction_keys,param_fn,set_name):
       """
       Performing a prediction takes...
index f1211d3..3aca1f9 100644 (file)
@@ -1,6 +1,14 @@
 #!/usr/bin/env python 
 # -*- coding: utf-8 -*- 
 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
 import cPickle
 import math
 import numpy
@@ -9,7 +17,7 @@ import pdb
 import unittest
 
 from qpalma_main import QPalma
-from Utils import print_prediction
+from qpalma.utils import print_prediction
 
 from createAlignmentFileFromPrediction import alignment_reconstruct
 
@@ -30,7 +38,7 @@ class TestQPalmaPrediction(unittest.TestCase):
 
       
 
-   def setUp(self):
+   def _setUp(self):
       print
       self.prediction_set = {}
 
@@ -78,6 +86,48 @@ class TestQPalmaPrediction(unittest.TestCase):
       example = (currentSeqInfo,read,currentQualities)
       self.prediction_set[id] = [example]
 
+
+   def setUp(self):
+      self.prediction_set = {}
+
+      # chr1 +  20-120
+      read  = 'catctatgcaacagcattacagtgatcaccggcccaaaaaacctgtgtctggggttttgcctgatgatagcagtgatactgaaactggatcaatggtaag'
+      currentQualities = [[40]*len(read)]
+
+      id       = 3
+      chromo   = 1
+      strand   = '+'
+
+      genomicSeq_start  = 3500
+      genomicSeq_stop   = 6500
+      print 'Position: ',
+      print genomicSeq_start,genomicSeq_stop 
+
+      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+      example = (currentSeqInfo,read,currentQualities)
+      self.prediction_set[id] = [example]
+
+      # chr1 - 5000-5100
+      read  = 'ctgtgtatctggttgctcaatatgctcgccggaaaatgaagatcatggatgctgtgagttctccttattgttcattatcaaactgatatgagtttctgat'
+      currentQualities = [[40]*len(read)]
+
+      id       = 4
+      chromo   = 1
+      strand   = '-'
+
+      total_size = 30432563
+
+      genomicSeq_start  = total_size - 6500
+      genomicSeq_stop   = total_size - 3500
+      print 'Position: ',
+      print genomicSeq_start,genomicSeq_stop 
+
+      currentSeqInfo = id,chromo,strand,genomicSeq_start,genomicSeq_stop
+
+      example = (currentSeqInfo,read,currentQualities)
+      self.prediction_set[id] = [example]
+
    
    def testAlignments(self):
       run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/saved_run'
@@ -99,20 +149,22 @@ class TestQPalmaPrediction(unittest.TestCase):
             print len(example)
 
       # fetch the data needed
-      g_dir    = run['dna_flat_files'] #'/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      acc_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
-      don_dir  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+      settings = {}
 
-      g_fmt = 'chr%d.dna.flat'
-      s_fmt = 'contig_%d%s'
+      settings['genome_dir']           = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome'
+      settings['acceptor_scores_loc']  = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+      settings['donor_scores_loc']     = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
 
-      num_chromo = 6
+      settings['genome_file_fmt']      = 'chr%d.dna.flat'
+      settings['splice_score_file_fmt']= 'contig_%d%s'
 
-      accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
-      seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+      allowed_fragments = [1]
+      settings['allowed_fragments'] = allowed_fragments
+
+      accessWrapper = DataAccessWrapper(settings)
+      seqInfo = SeqSpliceInfo(accessWrapper,allowed_fragments)
 
       qp = QPalma(run,seqInfo,True)
-      #qp.init_prediction(run,set_name)
       allPredictions = qp.predict(self.prediction_set,param)
 
       for current_prediction in allPredictions:
@@ -130,10 +182,11 @@ class TestQPalmaPrediction(unittest.TestCase):
          numExons = int(math.ceil(len(predExons) / 2))
          
          print alignment_reconstruct(current_prediction,numExons)
-         #print id,start_pos,predExons
+         print id,start_pos,predExons
 
       print 'Problem counter is %d' % qp.problem_ctr 
 
+
 if __name__ == '__main__':
    suite = unittest.TestLoader().loadTestsFromTestCase(TestQPalmaPrediction)
    unittest.TextTestRunner(verbosity=2).run(suite)