+ refactored code further
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 5 Mar 2008 16:09:21 +0000 (16:09 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 5 Mar 2008 16:09:21 +0000 (16:09 +0000)
+ evaluation script and job submission works now
TODO
+ check indices for those sites found by vmatch which are arbitrary off the
original ones

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8041 e1793c9e-67f9-0310-80fc-b846ff1f7b36

18 files changed:
dyn_prog/Makefile
dyn_prog/debug_tools.h
dyn_prog/fill_matrix.cpp
dyn_prog/qpalma_dp.h
qpalma/DataProc.py
qpalma/SIQP.py
qpalma/SIQP_CPX.py
qpalma/computeSpliceAlignWithQuality.py
qpalma/parsers.py
qpalma/set_param_palma.py
scripts/Evaluation.py
scripts/Experiment.py
scripts/ModelSelection.py
scripts/compile_dataset.py
scripts/qpalma_main.py
tools/data_tools/Makefile
tools/data_tools/filterReads.c
tools/plot_param.py

index 5b9fad6..e8b348e 100644 (file)
@@ -2,6 +2,7 @@ SRCS= Mathmatics.cpp\
                fill_matrix.cpp\
                qpalma_dp.cpp\
                result_align.cpp\
+               debug_tools.cpp\
                penalty_info.cpp\
                print_align.cpp\
                io.cpp
@@ -19,7 +20,9 @@ OBJS = $(SRCS:%.cpp=%.o)
 
 #CXXFLAGS=-O3 -fPIC
 #CXXFLAGS=-O3 -fPIC -pg -fprofile-arcs
-CXXFLAGS=-O3 -fPIC -ggdb
+
+#CXXFLAGS=-O3 -fPIC -ggdb -pg
+CXXFLAGS=-O3 -fPIC -ggdb #-fno-inline
 
 IF=QPalmaDP
 
index 7243830..4865bee 100644 (file)
@@ -3,13 +3,9 @@
 
 #include <cstdio>
 
-void fassert(bool exp,int line, char* file) {
-   if (! exp) {
-      printf("invalid fassert at line %d in file %s!\n",line,file);
-      exit(EXIT_FAILURE);
-   }
-}
+void fassert(bool exp,int line, char* file);
+
+#endif // __DEBUG_TOOLS__
 
 #define FA(expr) (fassert(expr,__LINE__,__FILE__))
 
-#endif // __DEBUG_TOOLS__
index 9ba1d86..2f87968 100644 (file)
 
 #define D_USE_QUALITY_SCORES 1
 
-void fassert(bool exp,int line, char* file);
-#define FA(expr) (fassert(expr,__LINE__,__FILE__))
-
-using namespace std;
-
 int check_char(char base)
 {
   switch(base)
index 98e908b..9940b86 100644 (file)
@@ -2,6 +2,7 @@
 #define _QPALMA_DP_H_
 
 #include "penalty_info.h"
+#include "debug_tools.h"
 
 struct ArrayElem { //24B
   double score;
@@ -32,8 +33,6 @@ typedef struct Align_pair { //24B
 void fill_matrix (int nr_paths, Pre_score* matrices[], int len_est, int len_dna, char* est, char* dna, double* prb, penalty_struct* functions, double* matchmatrix, penalty_struct* qualityScores, double* donor, double* acceptor, bool remove_duplicate_scores, int nr_donor_sites, int* donor_sites, int* max_score_positions);
 
 int check_char(char base);
-void fassert(bool exp,int line, char* file);
-#define FA(expr) (fassert(expr,__LINE__,__FILE__))
 
 bool result_align (Pre_score* matrices[], int matrixnr, int i,  int j, int* result_length_ptr, char* est, char* dna, double* prb, double* chastity,  int* s_align, int* e_align, int* m_param, double* alignmentscores, int* max_score_positions, penalty_struct* qparam, mode currentMode);
 
index 42ca2e0..c93a686 100644 (file)
@@ -4,6 +4,7 @@
 from numpy.matlib import mat,zeros,ones,inf
 import cPickle
 import pdb
+#import pydb
 import Configuration as Conf
 import tools
 from tools.PyGff import *
@@ -12,24 +13,26 @@ import scipy.io
 import sys
 
 from genome_utils import load_genomic
+from Genefinding import *
 
 def paths_load_data(filename,expt,genome_info,PAR,test=False):
 
-   data = io_pickle.load(filename)
+   #data = io_pickle.load(filename)
+   data = cPickle.load(open(filename))
 
-   #cPickle.dump(data,open(filename+'.alt','w+'))
-   #data = cPickle.load(open(filename+'.alt'))
+   Sequences   = data[0]
+   Acceptors   = data[1]
+   Donors      = data[2]
+   Exons       = data[3]
+   Ests        = data[4]
+   OriginalEsts= data[5]
+   Qualities   = data[6]
+   UpCut       = data[7]
+   StartPos    = data[8]
+   AlternativeSequences = data[9]
 
-   Sequences   = data['Sequences']
-   Acceptors   = data['Acceptors']
-   Donors      = data['Donors']
-   Exons       = data['Exons']
-   Ests        = data['Ests']
-   OriginalEsts= data['OriginalEsts']
-   Qualities   = data['Qualities']
-   SplitPositions = data['SplitPositions']
-
-   return Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities, SplitPositions
+   return Sequences, Acceptors, Donors, Exons, Ests, OriginalEsts, Qualities,\
+   UpCut, StartPos, AlternativeSequences
 
 
 def paths_load_data_solexa(expt,genome_info,PAR,test=False):
@@ -155,7 +158,6 @@ def paths_load_data_solexa(expt,genome_info,PAR,test=False):
       # now we want to use interval_query to get the predicted splice scores
       # trained on the TAIR sequence and annotation
 
-      from Genefinding import *
       
       interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
       num_fields = 1
index 2f1f767..fecfb1e 100644 (file)
@@ -13,8 +13,6 @@ import cvxopt.base as cb
 import logging
 logging.basicConfig(level=logging.DEBUG,format='%(levelname)s %(message)s')
 
-import qpalma.Configuration as Configuration
-
 class SIQP:
    """
    This class is a wrapper for the cvxopt/cplex qp solvers.
@@ -34,7 +32,7 @@ class SIQP:
 
    """
 
-   def __init__(self,fSize,numExamples,c):
+   def __init__(self,fSize,numExamples,c,run):
       assert fSize > 0, 'You have to have at least one feature!'
       assert numExamples > 0, 'You have to have at least one example!'
       self.numFeatures = fSize
@@ -42,6 +40,8 @@ class SIQP:
       self.C = c
       self.EPSILON = 10e-15
 
+      self.run = run
+
       logging.debug("Starting SIQP solver with %d examples. A feature space dim. of %d.", numExamples,fSize)
       logging.debug("Regularization parameters are C=%f."%(self.C))
 
@@ -58,19 +58,19 @@ class SIQP:
       # end of zeroing regularizer
 
    def createSmoothnessRegularizer(self):
-      # set regularizer to zero
+      run = self.run
+
       self.P = cb.matrix(0.0,(self.dimP,self.dimP))
       for i in range(self.numFeatures):
          self.P[i,i] = 1.0
 
-      lengthSP    = Configuration.numLengthSuppPoints
-      donSP       = Configuration.numDonSuppPoints
-      accSP       = Configuration.numAccSuppPoints
-      mmatrixSP   = Configuration.sizeMatchmatrix[0]\
-      *Configuration.sizeMatchmatrix[1]
-      numq        = Configuration.numQualSuppPoints
-      totalQualSP = Configuration.totalQualSuppPoints
-      numQPlifs = Configuration.numQualPlifs 
+      lengthSP    = run['numLengthSuppPoints']
+      donSP       = run['numDonSuppPoints']
+      accSP       = run['numAccSuppPoints']
+      mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
+      numq        = run['numQualSuppPoints']
+      totalQualSP = run['totalQualSuppPoints']
+      numQPlifs = run['numQualPlifs'] 
 
       #lengthGroupParam  = 5*1e-9
       #spliceGroupParam  = 1e-9
index d6f859d..a04c77d 100644 (file)
@@ -56,8 +56,8 @@ class SIQPSolver(SIQP):
    # Define types and constants
    Inf   = CPX_INFBOUND
 
-   def __init__(self,fSize,numExamples,c,proto):
-      SIQP.__init__(self,fSize,numExamples,c)
+   def __init__(self,fSize,numExamples,c,proto,run):
+      SIQP.__init__(self,fSize,numExamples,c,run)
       self.numFeatures = fSize
       self.numExamples = numExamples
       self.numVariables = self.numFeatures + self.numExamples
@@ -88,6 +88,20 @@ class SIQPSolver(SIQP):
       self.ub = FloatMatrix([self.Inf] * self.numVariables).data
       self.lb = FloatMatrix([-self.Inf] * self.numFeatures + [0.0]*self.numExamples).data
 
+      # set intron length to zero
+      #import pdb
+      #pdb.set_trace()
+
+      #self.ub[0] = 0.0
+      #self.ub[1] = 0.0
+
+      #self.lb[0] = 0.0
+      #self.lb[1] = 0.0
+
+      #pdb.set_trace()
+
+      ############################
+
       self.matbeg = IntMatrix([1]*self.numVariables).data
       self.matbeg[0] = 0
 
index 445bfab..ae6e646 100644 (file)
@@ -5,9 +5,10 @@ import numpy
 from numpy.matlib import mat,zeros,ones,inf
 import pdb
 from Plif import Plf,base_coord,linspace
-import Configuration as Conf
 
-def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs):
+import Helpers
+
+def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
    """
    Exonpos: Anfang Exon bis 1 rechts neben Ende Exon
    Daraus folgt: SpliceposAnfang ist gleich Ende Exon von Vorgaenger
@@ -54,7 +55,7 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
    # start of label feature generation
    #  
 
-   sizeMatchmatrix = Conf.sizeMatchmatrix
+   sizeMatchmatrix = run['matchmatrixRows']*run['matchmatrixCols'] #
    # counts the occurences of a,c,g,t,n in this order
 
    # counts the occurrences of a,c,g,t,n with their quality scores
@@ -64,7 +65,7 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
 
    for row in range(5):
       for col in range(6):
-         trueWeightQualityMat[row][col] = [0.0]*Conf.numQualSuppPoints # supp. points per plif
+         trueWeightQualityMat[row][col] = [0.0]*run['numQualSuppPoints'] # supp. points per plif
 
    dna_part = []
    est_part = []
@@ -86,12 +87,12 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
          orig_pos += 1
 
    for orig_char in dna_part:
-      assert orig_char in Conf.alphabet, pdb.set_trace()
+      assert orig_char in Helpers.alphabet, pdb.set_trace()
 
    for orig_char in est_part:
-      assert orig_char in Conf.alphabet, pdb.set_trace()
+      assert orig_char in Helpers.alphabet, pdb.set_trace()
 
-   trueWeightMatch = zeros((sizeMatchmatrix[0]*sizeMatchmatrix[1],1)) # Scorematrix fuer Wahrheit
+   trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
 
    map = {'-':0, 'a':1, 'c':2, 'g':3, 't':4, 'n':5}
 
@@ -136,12 +137,12 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
             trueWeightQualityMat[estnum-1][dnanum][Upper] += weightup
             trueWeightQualityMat[estnum-1][dnanum][Lower] += weightlow
 
-      trueWeightQuality = zeros((Conf.numQualSuppPoints*Conf.numQualPlifs,1))
+      trueWeightQuality = zeros((run['numQualSuppPoints']*run['numQualPlifs'],1))
 
       ctr = 0
       for row in range(5):
          for col in range(6):
-            for p_idx in range(Conf.numQualSuppPoints):
+            for p_idx in range(run['numQualSuppPoints']):
                #print ctr, row, col, p_idx
                trueWeightQuality[ctr] = trueWeightQualityMat[row][col][p_idx]
                ctr += 1
index 95de1ad..f28892f 100644 (file)
@@ -45,6 +45,8 @@ class FilteredReadParser(ReadParser):
       splitpos = int(splitpos)
       read_size = int(read_size)
 
+      seq=seq.lower()
+
       assert strand in ['D','P']
 
       if strand == 'D':
@@ -106,13 +108,20 @@ class RemappedReadParser(ReadParser):
       We assume that a line has the following entires:
 
       """
-      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
-      pos = int(pos)
-      mismatches = int(mismatches)
-      align_len = int(align_len)
-      offset = int(offset)
-      line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand, 'mismatches':mismatches, 'align_len':align_len,\
-      'offset':offset, 'alignment':alignment}
+      id,chr1,pos1,seq1,chr2,pos2,seq2,quality = line.split()
+
+      chr1 = int(chr1)
+      chr2 = int(chr2)
+
+      seq1=seq1.lower()
+      seq2=seq2.lower()
+
+      pos1 = int(pos1)
+      pos2 = int(pos2)
+
+      line_d = {'id':id, 'chr1':chr1, 'pos1':pos1, 'seq1':seq1, 'chr2':chr2,\
+      'pos2':pos2, 'seq2':seq2, 'quality':'quality'}
+
       return line_d
 
    def next(self):
@@ -134,3 +143,71 @@ class RemappedReadParser(ReadParser):
             old_entry = entries[id]
             old_entry.append(line_d)
             entries[id] = old_entry
+
+      return entries
+
+
+class MapParser(ReadParser):
+   """
+   This class offers a parser for the reads that are remapped by the vmatch
+   utility. 
+
+   According to the docu the entries are:
+   
+   ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
+
+   """
+
+   def __init__(self,filename):
+      ReadParser.__init__(self,filename)
+
+   def parseLine(self,line):
+      """
+      We assume that a line has the following entries:
+
+      """
+
+      id,chr,pos,strand,mismatches,length,offset,seq = line.split()
+
+      chr   = int(chr)
+      pos   = int(pos)
+
+      if strand == 'D':
+         strand = '+'
+
+      if strand == 'P':
+         strand = '-'
+
+      mismatches  = int(mismatches)
+      length      = int(length)
+      offset      = int(offset)
+
+      seq=seq.lower()
+
+      line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand,\
+      'mismatches':mismatches, 'length':length, 'offset':offset,\
+      'seq':seq}
+
+      return line_d
+
+   def next(self):
+      for line in self.fh:
+         line = line.strip()
+         yield self.parseLine(line)
+
+      raise StopIteration
+
+   def parse(self):
+      entries = {}
+
+      for elem in self.fh:
+         line_d = self.parseLine(elem)
+         id = line_d['id']
+         try:
+            entries[id] = [line_d]
+         except:
+            old_entry = entries[id]
+            old_entry.append(line_d)
+            entries[id] = old_entry
+
+      return entries
index 0508a98..17740fd 100644 (file)
@@ -5,31 +5,30 @@ import math
 import numpy.matlib
 import QPalmaDP
 import pdb
-import Configuration as Conf
 from Plif import *
 
-def set_param_palma(param, train_with_intronlengthinformation,\
-                                               min_intron_len=None, max_intron_len=None, min_svm_score=None, max_svm_score=None):
+import qpalma.Configuration as Conf
+
+def set_param_palma(param, train_with_intronlengthinformation, run):
 
    print 'Setting parameters ...'
    
-   min_intron_len = Conf.min_intron_len 
-   max_intron_len = Conf.max_intron_len
+   min_intron_len = run['min_intron_len'] 
+   max_intron_len = run['max_intron_len']
 
-   min_svm_score = Conf.min_svm_score 
-   max_svm_score = Conf.max_svm_score 
+   min_svm_score = run['min_svm_score'] 
+   max_svm_score = run['max_svm_score'] 
 
    h = Plf()
    d = Plf()
    a = Plf()
-   qualityPlifs = [None]*Conf.numQualPlifs
 
-   donSP       = Conf.numDonSuppPoints
-   accSP       = Conf.numAccSuppPoints
-   lengthSP    = Conf.numLengthSuppPoints
-   mmatrixSP   = Conf.sizeMatchmatrix[0]\
-   *Conf.sizeMatchmatrix[1]
-   totalQualSP = Conf.totalQualSuppPoints
+   qualityPlifs = [None]*run['numQualPlifs']
+   donSP       = run['numDonSuppPoints']
+   accSP       = run['numAccSuppPoints']
+   lengthSP    = run['numLengthSuppPoints']
+   mmatrixSP   = run['matchmatrixRows']*run['matchmatrixCols']
+   totalQualSP = run['totalQualSuppPoints']
 
    ####################
    # Gapfunktion
@@ -77,21 +76,29 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    # Matchmatrix
    ####################
    mmatrix = numpy.matlib.mat(param[lengthSP+donSP+accSP:lengthSP+donSP+accSP+mmatrixSP])
-   mmatrix.reshape(Conf.sizeMatchmatrix[0],Conf.sizeMatchmatrix[1]) 
+
+
+
+   mmatrix.reshape(run['matchmatrixRows'],run['matchmatrixCols'])
+
+
 
    ####################
    # Quality Plifs
    ####################
-   for idx in range(Conf.numQualPlifs):
+   for idx in range(run['numQualPlifs']):
       currentPlif = Plf()
-      currentPlif.limits    = linspace(Conf.min_qual,Conf.max_qual,Conf.numQualSuppPoints) 
-      begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
-      end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
+      currentPlif.limits    = linspace(run['min_qual'],run['max_qual'],run['numQualSuppPoints']) 
+      begin                 = lengthSP+donSP+accSP+mmatrixSP+(idx*run['numQualSuppPoints'])
+      end                   = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*run['numQualSuppPoints'])
+
       currentPlif.penalties = param[begin:end].flatten().tolist()[0]
       currentPlif.transform = '' 
       currentPlif.name      = 'q' 
-      currentPlif.max_len   = Conf.max_qual 
-      currentPlif.min_len   = Conf.min_qual
+
+      currentPlif.max_len   = run['max_qual'] 
+      currentPlif.min_len   = run['min_qual']
+
       qualityPlifs[idx] = currentPlif
 
    return h,d,a,mmatrix,qualityPlifs
index 5156d8f..5faf83d 100644 (file)
@@ -7,6 +7,8 @@ import pydb
 import pdb
 import os
 import os.path
+import math
+
 
 def evaluatePositions(eBegin,eEnd):
    eBegin_pos = [elem for elem in eBegin if elem == 0]
@@ -36,11 +38,10 @@ def evaluatePositions(eBegin,eEnd):
 
 
 def perform_prediction(current_dir,run_name):
-   #cmd = 'echo ./doPrediction.sh %s | qsub -l h_vmem=1.0G -cwd -j y -N \"%s.log\"'%(current_dir,run_name)
-   cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
-   os.system(cmd)
+   cmd = 'echo /fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/doPrediction.sh %s | qsub -l h_vmem=12.0G -cwd -j y -N \"%s.log\"'%(current_dir,run_name)
+   #cmd = './doPrediction.sh %s 1>%s.out 2>%s.err' %(current_dir,run_name,run_name)
    #print cmd
-
+   os.system(cmd)
 
 def forall_experiments(current_func,tl_dir):
    """
@@ -85,30 +86,150 @@ def prediction_on(filename):
    allWrongExons  = []
    allDoubleScores = []
 
-   ctr = 0
+   all_pos_correct_ctr = 0
 
    for current_example_pred in allPredictions:
-      ambigous_match = False
-      if len(current_example_pred) > 1:
-         ambigous_match = True
-         example_scores = []
+      for elem_nr,current_pred in enumerate(current_example_pred[:1]):
+
+         #gt_score = gt_example[] 
+         #score = gt_example['DPScores'].flatten().tolist()[0][0]
+
+         #all_pos_correct = evaluate_example(current_pred)
+         all_pos_correct = evaluate_unmapped_example(current_pred)
+
+         if all_pos_correct:
+            all_pos_correct_ctr += 1
+
+   print 'Total num. of examples: %d' % len(allPredictions)
+   print 'Number of total correct examples: %d' % all_pos_correct_ctr
+
+   #print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
+   #print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
+   #print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
+
+def evaluate_unmapped_example(current_prediction):
+
+   predExons = current_prediction['predExons']
+   trueExons = current_prediction['trueExons']
+
+   return compare_exons(predExons,trueExons)
+
+
+def evaluate_example(current_prediction):
+   
+   label = False
+
+   try:
+      label = current_prediction['label'] 
+   except:
+      pass
+
+   #predExons = current_prediction['predExons']
+   #trueExons = current_prediction['trueExons']
+   #pdb.set_trace()
+
+   if label:
+
+      start_pos = current_prediction['start_pos']
+      alternative_start_pos = current_prediction['alternative_start_pos']
+
+      predExons = current_prediction['predExons']
+      trueExons = current_prediction['trueExons']
+
+      #pdb.set_trace()
+      
+      if start_pos <= alternative_start_pos:
+         start_offset = alternative_start_pos-start_pos
+         predExons += start_offset
+
+         return compare_exons(predExons,trueExons)
+
+      if alternative_start_pos <= start_pos:
+         start_offset = start_pos-alternative_start_pos
+         predExons += start_offset
+
+         return compare_exons(predExons,trueExons)
+
+   else:
+      pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]
+
+      return False
+
+
+def compare_exons(predExons,trueExons):
+   e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
 
-      for elem_nr,current_pred in enumerate(current_example_pred):
-         e1_b_off = current_pred['e1_b_off']
-         e1_e_off = current_pred['e1_e_off']
-         e2_b_off = current_pred['e2_b_off']
-         e2_e_off = current_pred['e2_e_off']
+   if len(predExons) == 4:
+      e1_begin,e1_end = predExons[0],predExons[1]
+      e2_begin,e2_end = predExons[2],predExons[3]
+   else:
+      return False
 
+   e1_b_off = int(math.fabs(e1_begin - trueExons[0,0]))
+   e1_e_off = int(math.fabs(e1_end - trueExons[0,1]))
+
+   e2_b_off = int(math.fabs(e2_begin - trueExons[1,0]))
+   e2_e_off = int(math.fabs(e2_end - trueExons[1,1]))
+
+   if e1_b_off == 0 and e1_e_off == 0 and e2_b_off == 0\
+   and e2_e_off == 0:
+      return True
+
+   return False
+
+
+def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign):
+   newExons = []
+   oldElem = -1
+   SpliceAlign = SpliceAlign.flatten().tolist()[0]
+   SpliceAlign.append(-1)
+   for pos,elem in enumerate(SpliceAlign):
+      if pos == 0:
+         oldElem = -1
+      else:
+         oldElem = SpliceAlign[pos-1]
+
+      if oldElem != 0 and elem == 0: # start of exon
+         newExons.append(pos)
+
+      if oldElem == 0 and elem != 0: # end of exon
+         newExons.append(pos)
+
+   e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
+
+   if len(newExons) == 4:
+      e1_begin,e1_end = newExons[0],newExons[1]
+      e2_begin,e2_end = newExons[2],newExons[3]
+   else:
+      return None,None,None,None,newExons
+
+   e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
+   e1_e_off = int(math.fabs(e1_end - exons[0,1]))
+
+   e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
+   e2_e_off = int(math.fabs(e2_end - exons[1,1]))
+
+   return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
+
+
+if __name__ == '__main__':
+   dir = sys.argv[1]
+   assert os.path.exists(dir), 'Error directory does not exist!'
+
+   #forall_experiments(perform_prediction,dir)
+   forall_experiments(collect_prediction,dir)
+
+"""
          if elem_nr > 0:
             #print 'start positions'
             #print current_pred['start_pos'], current_pred['alternative_start_pos']
 
             if current_pred['label'] == False or (current_pred['label'] == True
             and len(current_pred['predExons']) != 4):
-               if current_pred['DPScores'].flatten().tolist()[0][0] <\
+               if 
                current_example_pred[0]['DPScores'].flatten().tolist()[0][0]:
                   print current_pred['trueExons'][0,1]-current_pred['trueExons'][0,0],\
-                  current_pred['trueExons'][1,1]-current_pred['trueExons'][1,0],\
+                  current_pred['trueExons'][1,1] - current_pred['trueExons'][1,0],\
                   current_pred['predExons']
                   print current_pred['DPScores'].flatten().tolist()[0][0],\
                   current_example_pred[0]['DPScores'].flatten().tolist()[0][0]
@@ -132,23 +253,4 @@ def prediction_on(filename):
          e2Begin_pos,e2Begin_neg,e2End_pos,e2End_neg,mean_e2Begin_neg,mean_e2End_neg = evaluatePositions(exon2Begin,exon2End)
 
       allDoubleScores.append(example_scores)
-
-   all_pos_correct = 0
-   for idx in range(len(exon1Begin)):
-      if exon1Begin[idx] == 0 and exon1End[idx] == 0\
-      and exon2Begin[idx] == 0 and exon2End[idx] == 0:
-         all_pos_correct += 1
-
-   print 'Total num. of examples: %d' % len(allPredictions)
-   print 'Number of total correct examples: %d' % all_pos_correct
-   print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
-   print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
-   print 'Mean of pos. offset:\t\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_e1Begin_neg,mean_e1End_neg,mean_e2Begin_neg,mean_e2End_neg)
-
-
-if __name__ == '__main__':
-   dir = sys.argv[1]
-   assert os.path.exists(dir), 'Error directory does not exist!'
-
-   #forall_experiments(perform_prediction,dir)
-   forall_experiments(collect_prediction,dir)
+"""
index 506a09a..ae462e6 100644 (file)
@@ -21,7 +21,7 @@ def createRuns():
    numFolds=5
 
    # the main directory where all results are stored
-   experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_test'
+   experiment_dir = '/fml/ag-raetsch/home/fabio/tmp/QPalma_2k'
 
    assert os.path.exists(experiment_dir), 'toplevel dir for experiment does not exist!'
 
@@ -34,11 +34,13 @@ def createRuns():
 
    allRuns = []
 
-   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test'
+   #dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test'
+   dataset_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/dataset_remapped_test_2k'
 
    for QFlag in [True,False]:
       for SSFlag in [True,False]:
-         for ILFlag in [True,False]:
+         for ILFlag in [True]:
+         #for ILFlag in [True,False]:
             
             # create a new Run object
             currentRun = Run()
@@ -55,7 +57,7 @@ def createRuns():
             currentRun['print_matrix']          = Conf.print_matrix
             currentRun['read_size']             = Conf.read_size
 
-            currentRun['numLengthSuppPoints']   = 2 #Conf.numLengthSuppPoints
+            currentRun['numLengthSuppPoints']   = 10 #Conf.numLengthSuppPoints
             currentRun['numDonSuppPoints']      = 10
             currentRun['numAccSuppPoints']      = 10
 
index 03d0704..70b5d99 100644 (file)
@@ -35,7 +35,7 @@ class Model:
 
    def doSelection(self):
       for instance in self.allInstances:
-         time.sleep(3)
+         time.sleep(2)
          os.system('echo ./resurrect %s | qsub -l h_vmem=3.0G -cwd -j y -N \"%s.log\"'%(instance,instance))
          #os.system('echo "./resurrect %s >out_%s.log 2>err_%s.log &"'%(instance,instance,instance))
 
index 008eb6b..57467f9 100644 (file)
@@ -6,7 +6,7 @@ import random
 import os
 import re
 import pdb
-import pydb
+#import pydb
 import io_pickle
 
 import numpy
@@ -183,7 +183,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          if instance_counter % 1000 == 0:
             print 'processed %d examples' % instance_counter
 
-         if instance_counter == 2000:
+         if instance_counter == 20000:
             break
 
    print 'Full dataset has size %d' % len(Sequences)
@@ -201,7 +201,10 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    #'Ids':Ids, 'FReads':FReads}
 
    # saving new dataset
-   io_pickle.save(dataset_file,dataset)
+   #io_pickle.save(dataset_file,dataset)
+
+   #io_pickle.save(dataset_file,dataset)
+   cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
 
 
 def process_filtered_read(fRead,currentGene,dna_flat_files,test):
@@ -473,8 +476,10 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
 
    # now fetch acceptor and donor scores:
 
-   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
-   % (chr,strand)
+   #sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
+   #% (chr,strand)
+
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' % (chr,strand)
 
    # fetch acceptor scores
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
@@ -497,8 +502,10 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
    acc = acc[1:] + [-inf]
 
    # fetch donor scores
-   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
-   % (chr,strand)
+   #sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
+   #% (chr,strand)
+
+   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s' % (chr,strand)
 
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
 
@@ -597,13 +604,13 @@ def process_map(reReads,fRead,dna_flat_files):
       # middle part of the read matches somewhere
       # search up/downstream for the rest
       else:
-         genomicSeq_start = pos
-         genomicSeq_stop = pos + fragment_size
+         genomicSeq_start = pos - (fragment_size/2)
+         genomicSeq_stop = pos + (fragment_size/2)
          alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
          
-         genomicSeq_start = pos - fragment_size
-         genomicSeq_stop = pos+length
-         alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
+         #genomicSeq_start = pos - fragment_size
+         #genomicSeq_stop = pos+36
+         #alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
          continue
          
    return alternativeSeq
index 46ca884..e52984e 100644 (file)
@@ -21,7 +21,7 @@ import scipy.io
 import pdb
 import re
 import os.path
-import pydb
+#import pydb
 
 from compile_dataset import getSpliceScores
 
@@ -68,15 +68,6 @@ class QPalma:
       else:
          assert(False)
 
-      full_working_path = os.path.join(run['experiment_path'],run['name'])
-      #assert not os.path.exists(full_working_path)
-      #os.mkdir(full_working_path)
-      assert os.path.exists(full_working_path)
-
-      # ATTENTION: Changing working directory
-      os.chdir(full_working_path)
-
-      cPickle.dump(run,open('run_object.pickle','w+'))
 
    def plog(self,string):
       self.logfh.write(string)
@@ -180,6 +171,18 @@ class QPalma:
    def train(self):
       run = self.run
 
+      full_working_path = os.path.join(run['experiment_path'],run['name'])
+
+      assert not os.path.exists(full_working_path)
+      os.mkdir(full_working_path)
+
+      assert os.path.exists(full_working_path)
+
+      # ATTENTION: Changing working directory
+      os.chdir(full_working_path)
+
+      cPickle.dump(run,open('run_object.pickle','w+'))
+
       self.logfh = open('_qpalma_train.log','w+')
 
       self.plog("Settings are:\n")
@@ -483,6 +486,7 @@ class QPalma:
 
                   _newSpliceAlign = newSpliceAlign[0].flatten().tolist()[0]
                   _newEstAlign = newEstAlign[0].flatten().tolist()[0]
+
                   line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
                   self.plog(line1+'\n')
                   self.plog(line2+'\n')
@@ -590,15 +594,16 @@ class QPalma:
       self.logfh = open('_qpalma_predict.log','w+')
 
       # predict on training set
-      print '##### Prediction on the training set #####'
+      self.plog('##### Prediction on the training set #####\n')
+
       self.predict(param_filename,0,beg,'TRAIN')
       
       # predict on test set
-      print '##### Prediction on the test set #####'
+      self.plog('##### Prediction on the test set #####\n')
       self.predict(param_filename,beg,end,'TEST')
    
+      self.plog('##### Finished prediction #####\n')
       self.logfh.close()
-      #pdb.set_trace()
 
    def predict(self,param_filename,beg,end,set_flag):
       """
@@ -629,8 +634,6 @@ class QPalma:
 
       AlternativeSequences = self.AlternativeSequences[beg:end]
 
-      print 'STARTING PREDICTION'
-
       # number of training instances
       N = numExamples = len(Sequences) 
       assert len(Exons) == N and len(Ests) == N\
@@ -670,6 +673,8 @@ class QPalma:
 
       # beginning of the prediction loop
       for exampleIdx in range(numExamples):
+         self.plog('Loading example nr. %d...\n'%exampleIdx)
+
          dna = Sequences[exampleIdx] 
          est = Ests[exampleIdx] 
 
@@ -731,7 +736,8 @@ class QPalma:
          # first make a prediction on the dna fragment which comes from the ground truth                  
          current_prediction = self.calc_alignment(dna, est, exons, quality, don_supp, acc_supp, d, a, h, mmatrix, qualityPlifs)
          current_prediction['exampleIdx'] = exampleIdx
-         current_prediction['start_pos'] = current_start_pos
+         current_prediction['start_pos']  = current_start_pos
+         current_prediction['label'] = True
 
          current_example_predictions.append(current_prediction)
 
@@ -741,6 +747,15 @@ class QPalma:
             chr, strand, genomicSeq_start, genomicSeq_stop, currentLabel = alternative_alignment
             currentDNASeq, currentAcc, currentDon = get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,run['dna_flat_files'])
 
+            if not run['enable_splice_signals']:
+               for idx,elem in enumerate(currentDon):
+                  if elem != -inf:
+                     currentDon[idx] = 0.0
+
+               for idx,elem in enumerate(currentAcc):
+                  if elem != -inf:
+                     currentAcc[idx] = 0.0
+
             current_prediction = self.calc_alignment(currentDNASeq, est, exons,\
             quality, currentDon, currentAcc, d, a, h, mmatrix, qualityPlifs)
             current_prediction['exampleIdx'] = exampleIdx
@@ -801,23 +816,22 @@ class QPalma:
 
       _newSpliceAlign = newSpliceAlign.flatten().tolist()[0]
       _newEstAlign = newEstAlign.flatten().tolist()[0]
+       
+      if False:
+         line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
+         self.plog(line1+'\n')
+         self.plog(line2+'\n')
+         self.plog(line3+'\n')
 
-      line1,line2,line3 = pprint_alignment(_newSpliceAlign,_newEstAlign, dna_array, est_array)
-      self.plog(line1+'\n')
-      self.plog(line2+'\n')
-      self.plog(line3+'\n')
-
-      e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons = self.evaluateExample(dna,est,exons,newSpliceAlign,newEstAlign)
+      newExons = self.calculatePredictedExons(newSpliceAlign)
 
-      current_prediction = {'e1_b_off':e1_b_off,'e1_e_off':e1_e_off,\
-      'e2_b_off':e2_b_off ,'e2_e_off':e2_e_off,\
-      'predExons':newExons, 'trueExons':exons,\
-      'dna':dna, 'est':est, 'DPScores':newDPScores }
+      current_prediction = {'predExons':newExons, 'trueExons':exons,\
+      'dna':dna, 'est':est, 'DPScores':newDPScores}
 
       return current_prediction
 
 
-   def evaluateExample(self,dna,est,exons,SpliceAlign,newEstAlign):
+   def calculatePredictedExons(self,SpliceAlign):
       newExons = []
       oldElem = -1
       SpliceAlign = SpliceAlign.flatten().tolist()[0]
@@ -834,25 +848,7 @@ class QPalma:
          if oldElem == 0 and elem != 0: # end of exon
             newExons.append(pos)
 
-      e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
-
-      #self.plog(("Example %d"%exampleIdx) + str(exons) + " "+  str(newExons) + "\n")
-      #self.plog(("SpliceAlign " + str(SpliceAlign)+ "\n"))
-      self.plog(("read: " + str(est)+ "\n"))
-
-      if len(newExons) == 4:
-         e1_begin,e1_end = newExons[0],newExons[1]
-         e2_begin,e2_end = newExons[2],newExons[3]
-      else:
-         return None,None,None,None,newExons
-
-      e1_b_off = int(math.fabs(e1_begin - exons[0,0]))
-      e1_e_off = int(math.fabs(e1_end - exons[0,1]))
-
-      e2_b_off = int(math.fabs(e2_begin - exons[1,0]))
-      e2_e_off = int(math.fabs(e2_end - exons[1,1]))
-
-      return e1_b_off,e1_e_off,e2_b_off,e2_e_off,newExons
+      return newExons
 
 
 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
index d4abd56..f950d55 100644 (file)
@@ -10,5 +10,8 @@ OBJS = $(SRCS:%.cpp=%.o)
 all: $(OBJS)
        @ $(CC) $(CFLAGS) -o filterReads $(OBJS) filterReads.c
 
+test:
+       @ ./filterReads subset.gff subset.reads output 1
+
 clean:
        @ rm -rf *.o
index 17420b8..d9ae253 100644 (file)
@@ -59,6 +59,7 @@ unsigned long read_nr = 1;
 
 int combined_reads = 0;
 
+
 int main(int argc, char* argv[]) {
 
    if(argc != 5) {
@@ -81,7 +82,6 @@ int main(int argc, char* argv[]) {
    FILE *reads_fs = fopen(reads_filename,"r");
    FILE *out_fs = fopen(output_filename,"w");
 
-   //read_nr = atoi(argv[4]);
    read_nr = strtoul(argv[4],NULL,10);
    read_nr++;
 
@@ -482,15 +482,13 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    if(up_range+down_range < read_size)
       return 0;
 
-   int half_read = read_size / 2;
-
    // both overlap more than a half of the new read
-   if(up_range >= half_read &&  down_range >= half_read) {
+   //if(up_range >= half_read &&  down_range >= half_read) {
 
-      u_size = half_read;
-      d_size = half_read;
+   //   u_size = half_read;
+   //   d_size = half_read;
 
-   } else {
+   //} else {
 
       //if(up_range < down_range) {
       //   u_size = up_range;
@@ -502,16 +500,18 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
       //   u_size = read_size - d_size;
       //}
       
-      if(up_range < down_range) {
+      //if(up_range < down_range) {
+      if (read_nr % 2 != 0) {
          d_size = down_range;
          u_size = read_size - d_size;
       }
 
-      if(up_range > down_range) {
+      //if(up_range >= down_range) {
+      if (read_nr % 2 == 0) {
          u_size = up_range;
          d_size = read_size - u_size;
       }
-   }
+   //}
 
    int p_start  = exon_stop - u_size + 1;
    int p_stop   = exon_start + d_size - 1;
@@ -604,8 +604,8 @@ int join_reads(int exon_stop, int exon_start, Read* up_read, Read* down_read, FI
    //fprintf(out_fs,"%d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n",
    //   read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,up_read->seq,down_read->seq);
 
-   fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
-      read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
+   fprintf(out_fs,"%lu\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\n",
+      read_nr,up_read->chr,up_read->strand,new_seq,cut_pos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop,u_size);
 
    read_nr++;
    combined_reads++;
index 90bf300..6a43fd5 100644 (file)
@@ -26,7 +26,8 @@ def plot_param(filename,train_with_intronlengthinformation=1):
     """
     assert(Configuration.mode == 'using_quality_scores')
     param = cPickle.load(open(filename))
-    [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,train_with_intronlengthinformation)
+    [h,d,a,mmatrix,qualityPlifs] =\
+    set_param_palma(param,train_with_intronlengthinformation,None)
 
     # construct the substitution matrix using the last value of qualityPlif
     substmat = numpy.matlib.zeros((1,(Configuration.estPlifs+1)*Configuration.dnaPlifs))
@@ -52,6 +53,9 @@ def plot_param(filename,train_with_intronlengthinformation=1):
     pylab.yticks( numpy.arange(5.5,0,-1), ('-', 'A', 'C', 'G', 'T', 'N') )
     pylab.xlabel('DNA',fontsize=20)
     pylab.ylabel('EST',fontsize=20)
+
+    pylab.savefig('matrix.eps')
+    pylab.clf()
     
     # plot donor and acceptor transformations
     pylab.figure()
@@ -61,23 +65,33 @@ def plot_param(filename,train_with_intronlengthinformation=1):
     pylab.ylabel('transition score',fontsize=20)
     pylab.legend()
 
+    pylab.savefig('donacc.eps')
+    pylab.clf()
+
     # plot intron length transformations
     pylab.figure()
     pylab.plot(h.limits,h.penalties,'k+-')
     pylab.xlabel('intron length',fontsize=20)
     pylab.ylabel('transition score',fontsize=20)
 
+    pylab.savefig('intron_len.eps')
+    pylab.clf()
+
     # plot quality score transformations
     pylab.figure()
     for pos,plif in enumerate(qualityPlifs):
-      #if pos in [1,8,15,22]:
-      #   pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
+      if pos in [1,8,15,22,7,13,19]:
+      #if pos in [3,9,21]:
+         pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
 
-      pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
+      #pylab.plot(plif.limits,plif.penalties,'b^-',hold=True)
     pylab.xlabel('quality value',fontsize=20)
     pylab.ylabel('transition score',fontsize=20)
 
-    pylab.show()
+    pylab.savefig('quality.eps')
+    pylab.clf()
+
+    #pylab.show()
     
 
 if __name__ == '__main__':