+ added exons read boundary output to filterReads output
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 25 Jan 2008 14:33:07 +0000 (14:33 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 25 Jan 2008 14:33:07 +0000 (14:33 +0000)
+ minor modifications of data processing tools to allow for train/predict mode

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7580 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
qpalma/DataProc.py
qpalma/computeSpliceWeights.py
qpalma/tools/splicesites.py
scripts/qpalma_predict.py
scripts/qpalma_train.py
tools/data_tools/filterReads.c

index 4334ef9..36db5df 100644 (file)
@@ -218,7 +218,7 @@ mode = 'using_quality_scores'
 numDonSuppPoints     = 30
 numAccSuppPoints     = 30
 numLengthSuppPoints  = 30 
-numQualSuppPoints    = 16
+numQualSuppPoints    = 4
 
 min_qual = -1
 max_qual = 40
index 18efe36..33b2995 100644 (file)
@@ -10,8 +10,7 @@ import io_pickle
 import scipy.io
 import pdb
 
-
-def paths_load_data_solexa(expt,genome_info,PAR):
+def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    # expt can be 'training','validation' or 'test'
    assert expt in ['training','validation','test']
 
@@ -32,7 +31,7 @@ def paths_load_data_solexa(expt,genome_info,PAR):
 
    for line in open(est_filename):
       line = line.strip()
-      chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,exon_idx = line.split()
+      chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
       splitpos = int(splitpos)
       length = int(length)
       prb = [ord(elem)-50 for elem in prb]
@@ -48,59 +47,63 @@ def paths_load_data_solexa(expt,genome_info,PAR):
          currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
       except:
          continue
+
+      if len(currentSeq) < (currentGene.stop - currentGene.start):
+         continue
          
-      #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
-      #Acceptors.append([-inf]*currentSize)
-      #Donors.append([-inf]*currentSize)
+      oldSeq = currentSeq
 
-      exon_idx = int(exon_idx)
+      p_start = int(p_start)
+      exon_stop = int(exon_stop)
+      exon_start = int(exon_start)
+      p_stop = int(p_stop)
+
+      assert p_start <= exon_stop <= exon_start <= p_stop, 'Invalid Indices'
+      assert p_stop - p_start >= 36
 
-      assert exon_idx > 0
-      #print "%d " % exon_idx,
       currentExons = zeros((2,2))
-      #for idx in range(len(currentGene.exons)):
-      #   currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
-      #   currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
 
-      try:
-         currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
-         currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
+      currentExons[0,0] = p_start-currentGene.start
+      currentExons[0,1] = exon_stop-currentGene.start
 
-         currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
-         currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
+      currentExons[1,0] = exon_start-currentGene.start
+      currentExons[1,1] = p_stop-currentGene.start
 
-         cut_offset = 500
+      import copy
+      oldExons = copy.deepcopy(currentExons)
+         
+      up_cut   = int(currentExons[0,0])
+      down_cut = int(currentExons[1,1])
 
-         up_cut   = int(currentExons[0,0]) - cut_offset
-         down_cut = int(currentExons[1,1]) + cut_offset
+      if test:
+         up_cut = up_cut-500
          if up_cut < 0:
             up_cut = 0
 
+         down_cut = down_cut+500
+
          if down_cut > len(currentSeq):
             down_cut = len(currentSeq)
 
-         assert down_cut - up_cut > 0
-         assert down_cut - up_cut <= len(currentSeq)
+      currentSeq = currentSeq[up_cut:down_cut]
 
-         currentSeq = currentSeq[up_cut:down_cut]
-
-         #pdb.set_trace()
+      currentExons[0,1] += 1
+      #currentExons[1,0] += 1
 
-         if up_cut > 0:
-            currentExons[0,0] -= up_cut 
-            currentExons[0,1] -= up_cut
+      if test:
+         currentExons -= up_cut
+      else:
+         currentExons -= currentExons[0,0]
 
-            currentExons[1,0] -= up_cut
-            currentExons[1,1] -= up_cut
-            
-      except:
-         continue
-         #pdb.set_trace()
+      try:
+         if not (currentSeq[int(currentExons[0,1])] == 'g' and currentSeq[int(currentExons[0,1])+1] == 't'):
+            continue
 
-      #pdb.set_trace()
+         if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+            continue
 
-      currentExons[0,1] += 1
-      #currentExons[1,0] += 1
+      except:
+         pdb.set_trace()
 
       Sequences.append(currentSeq)
 
@@ -114,14 +117,14 @@ def paths_load_data_solexa(expt,genome_info,PAR):
 
       SplitPositions.append(splitpos)
 
-      if len(Sequences[-1]) == 2755:
-         Sequences = Sequences[:-1]
-         Acceptors = Acceptors[:-1]
-         Donors    = Donors[:-1]
-         Exons     = Exons[:-1]
-         Ests      = Ests[:-1]
-         Qualities = Qualities[:-1]
-         SplitPositions = SplitPositions[:-1]
+      #if len(Sequences[-1]) == 2755:
+      #   Sequences = Sequences[:-1]
+      #   Acceptors = Acceptors[:-1]
+      #   Donors    = Donors[:-1]
+      #   Exons     = Exons[:-1]
+      #   Ests      = Ests[:-1]
+      #   Qualities = Qualities[:-1]
+      #   SplitPositions = SplitPositions[:-1]
    
    print 'found %d examples' % len(Sequences)
 
index 1bfb21e..14b300c 100644 (file)
@@ -46,7 +46,7 @@ def calculateWeights(plf, scores):
 
    return currentWeight
 
-def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp,dec=False):
+def computeSpliceWeights(d, a, h, SpliceAlign, don_supp, acc_supp):
    ####################################################################################
    # 1. Donor: In don_supp stehen Werte der SVM., in SpliceAlign die Einsen
    ####################################################################################
index ac64a9d..e20e9dd 100644 (file)
@@ -380,8 +380,8 @@ def getDonAccScores(Sequences):
         acc_score = [acc_score[0]] + acc_score
         acc_score = acc_score[:-1]
         Acceptors.append(acc_score)
-        print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
-        print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
+        #print str((max(don_score),min(numpy.where(numpy.isinf(don_score),100,don_score))))
+        #print str((max(acc_score),min(numpy.where(numpy.isinf(acc_score),100,acc_score))))
 
     #pdb.set_trace()
 
index 241410d..b5cbaa2 100644 (file)
@@ -35,6 +35,8 @@ import qpalma.Configuration
 from qpalma.Plif import Plf
 from qpalma.Helpers import *
 
+from qpalma.tools.splicesites import getDonAccScores
+
 def getQualityFeatureCounts(qualityPlifs):
    weightQuality = qualityPlifs[0].penalties
    for currentPlif in qualityPlifs[1:]:
@@ -70,18 +72,26 @@ class QPalma:
          use_quality_scores = False
 
       elif Conf.mode == 'using_quality_scores':
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
-         #end = 50
-         #Sequences   = Sequences[:end]
-         #Acceptors   = Acceptors[:end]
-         #Donors      = Donors[:end]
-         #Exons       = Exons[:end]
-         #Ests        = Ests[:end]
-         #Qualities   = Qualities[:end]
-         #SplitPos    = SplitPos[:end]
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos =\
+         paths_load_data_solexa('training',None,self.ARGS,True)
+
+         begin = 200
+         end = 400
+         Sequences   = Sequences[begin:end]
+         Exons       = Exons[begin:end]
+         Ests        = Ests[begin:end]
+         Qualities   = Qualities[begin:end]
+         Acceptors   = Acceptors[begin:end]
+         Donors      = Donors[begin:end]
+
+         pdb.set_trace()
+
+         Donors, Acceptors = getDonAccScores(Sequences)
+
+
+         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
+         #SplitPos = [1]*len(Qualities)
 
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         SplitPos = [1]*len(Qualities)
          #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
          #pdb.set_trace()
          #Qualities = []
@@ -105,7 +115,7 @@ class QPalma:
 
       # Initialize parameter vector  / param = numpy.matlib.rand(126,1)
       #param_filename = '/fml/ag-raetsch/home/fabio/svn/projects/QPalma/python/elegans.param'
-      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_30.pickle'
+      param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_200.pickle'
       param = load_param(param_filename)
 
       # Set the parameters such as limits penalties for the Plifs
@@ -401,26 +411,27 @@ def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
       else:
          oldElem = SpliceAlign[pos-1]
 
-      if oldElem == 2 and elem == 0: # start of exon
+      if oldElem != 0 and elem == 0: # start of exon
          newExons.append(pos-1)
 
-      if oldElem == 0 and elem == 1: # end of exon
+      if oldElem == 0 and elem != 0: # end of exon
          newExons.append(pos)
 
-   pdb.set_trace()
-   #up_off   = -1
-   #down_off = -1
+   up_off   = -1
+   down_off = -1
 
-   #if len(newExons) != 4:
-   #   acc = 0.0
-   #else:
-   #   e1_begin,e1_end = newExons[0],newExons[1]
-   #   e2_begin,e2_end = newExons[2],newExons[3]
+   if len(newExons) != 4:
+      acc = 0.0
+   else:
+      e1_begin,e1_end = newExons[0],newExons[1]
+      e2_begin,e2_end = newExons[2],newExons[3]
 
-   #   up_off   = int(math.fabs(e1_end - exons[0,1]))
-   #   down_off = int(math.fabs(e2_begin - exons[1,0]))
+      up_off   = int(math.fabs(e1_end - exons[0,1]))
+      down_off = int(math.fabs(e2_begin - exons[1,0]))
+
+   pdb.set_trace()
 
-   #return up_off,down_off
+   return up_off,down_off
 
 if __name__ == '__main__':
    qpalma = QPalma()
index 244eb7f..1eac5c3 100644 (file)
@@ -74,19 +74,41 @@ class QPalma:
 
          use_quality_scores = False
       elif Configuration.mode == 'using_quality_scores':
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
 
-         #end = 50
-         #Sequences   = Sequences[:end]
-         #Exons       = Exons[:end]
-         #Ests        = Ests[:end]
-         #Qualities   = Qualities[:end]
-         #SplitPos    = SplitPos[:end]
-
-         #Donors, Acceptors = getDonAccScores(Sequences)
-
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
 
+         filename = 'real_dset_%s'% 'recent'
+         if True:# not os.path.exists(filename):
+            Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+
+            end = 200
+            Sequences   = Sequences[:end]
+            Exons       = Exons[:end]
+            Ests        = Ests[:end]
+            Qualities   = Qualities[:end]
+            Acceptors   = Acceptors[:end]
+            Donors      = Donors[:end]
+
+            pdb.set_trace()
+
+            Donors, Acceptors = getDonAccScores(Sequences)
+            #dset = Dataset()
+            #dset.Sequences = Sequences
+            #dset.Acceptor  = Acceptors
+            #dset.Donors    = Donors
+            #dset.Exons     = Exons
+            #dset.Ests      = Ests
+            #dset.Qualities = Qualities
+            #cPickle.dump(dset,open(filename,'w+'))
+         else:
+            dset = cPickle.load(open(filename))
+            Sequences = dset.Sequences
+            Acceptors = dset.Acceptors
+            Donors = dset.Donors
+            Exons =  dset.Exons
+            Ests = dset.Ests
+            Qualities = dset.Qualities
+
+         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
          #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
          #Qualities = []
          #for i in range(len(Ests)):
@@ -175,8 +197,8 @@ class QPalma:
             don_supp = Donors[exampleIdx] 
             acc_supp = Acceptors[exampleIdx] 
 
-            if exons[-1,1] > len(dna):
-               continue
+            #if exons[-1,1] > len(dna):
+            #   continue
 
             #pdb.set_trace()
             # Berechne die Parameter des wirklichen Alignments (but with untrained d,a,h ...)    
@@ -324,9 +346,9 @@ class QPalma:
 
             for pathNr in range(num_path[exampleIdx]):
                #print 'decodedWeights' 
-               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,
-               h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,
-               acc_supp,True)
+               weightDon, weightAcc, weightIntron = computeSpliceWeights(d, a,\
+               h, newSpliceAlign[pathNr,:].flatten().tolist()[0], don_supp,\
+               acc_supp)
 
                decodedQualityFeatures = zeros((Configuration.totalQualSuppPoints,1))
                for qidx in range(Configuration.totalQualSuppPoints):
index ccefb19..2dc7229 100644 (file)
@@ -352,6 +352,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
    int up_idx, down_idx, status;
    char* upstream_line = malloc(sizeof(char)*256);
    char* downstream_line = malloc(sizeof(char)*256);
+   int p_start, p_stop;
 
    int buffer_size= 64;
 
@@ -422,7 +423,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          new_chastity[0] = '\0';
          int splitpos = 0;
          
-         fit = fitting(up_prb+(36-overlap),down_prb);
+         fit = fitting(up_prb+(36-overlap),down_prb+(exon_start-down_pos));
          if (fit == -1)
             continue;
 
@@ -433,18 +434,32 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          new_strand  = up_strand;
          splitpos = overlap;
 
+         p_start = up_pos;
+         p_stop =  exon_start+(read_size-overlap);
+
          strncat(new_seq,new_up_seq,overlap);
          strncat(new_prb,up_prb,overlap);
          strncat(new_cal_prb,up_cal_prb,overlap);
          strncat(new_chastity,up_chastity,overlap);
 
-         strncat(new_seq,new_down_seq+fit,read_size-overlap);
-         strncat(new_prb,down_prb+fit,read_size-overlap);
-         strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
-         strncat(new_chastity,down_chastity+fit,read_size-overlap);
-
-         fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\n",
-         new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,exon_idx);
+         //strncat(new_seq,new_down_seq+fit,read_size-overlap);
+         //strncat(new_prb,down_prb+fit,read_size-overlap);
+         //strncat(new_cal_prb,down_cal_prb+fit,read_size-overlap);
+         //strncat(new_chastity,down_chastity+fit,read_size-overlap);
+         
+         int offset = exon_start-down_pos;
+         strncat(new_seq,new_down_seq+offset,read_size-overlap);
+         strncat(new_prb,down_prb+offset,read_size-overlap);
+         strncat(new_cal_prb,down_cal_prb+offset,read_size-overlap);
+         strncat(new_chastity,down_chastity+offset,read_size-overlap);
+
+         // The four last integers specify the positions of 
+         // p_start : the position in the dna where the first read starts
+         // exons_stop  : the position in the dna where the first exons ends
+         // exons_start : the position in the dna where the second exons starts
+         // p_stop  : the position in the dna where the (truncated) second read ends
+         fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
+         new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
 
          combined_reads++;
          used_flag[down_idx] = 1;
@@ -453,6 +468,7 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
 
    free(upstream_line);
    free(downstream_line);
+   free(used_flag);
   
    free(new_up_seq);
    free(new_down_seq); 
@@ -496,8 +512,8 @@ int fitting(char* up_prb, char* down_prb) {
       didx = uidx+w_size;
       current_mean_up += up_prb[uidx]-50;
       current_mean_down += up_prb[didx]-50;
-
    }
+
    current_mean_up /= w_size;
    current_mean_down /= w_size;
 
@@ -505,8 +521,8 @@ int fitting(char* up_prb, char* down_prb) {
       didx = uidx+w_size;
       current_variance_up += pow(up_prb[uidx]-50 - current_mean_up,2);
       current_variance_down += pow(up_prb[didx]-50 - current_mean_up,2);
-      
    }
+
    current_variance_up /= (w_size-1);
    current_variance_down /= (w_size-1);