+ restored functionality
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 23 Jan 2008 11:39:31 +0000 (11:39 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 23 Jan 2008 11:39:31 +0000 (11:39 +0000)
+ exon_idx bug repaired (+1 offset for all was incorrect)
+ rearranged python code

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7543 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/DataProc.py
qpalma/paths_load_data.py [deleted file]
qpalma/paths_load_data_pickle.py [deleted file]
qpalma/paths_load_data_solexa.py [deleted file]
scripts/qpalma_predict.py
scripts/qpalma_train.py
tools/data_tools/filterReads.c

index f5af3f5..71aa4d5 100644 (file)
@@ -5,7 +5,7 @@ from numpy.matlib import mat,zeros,ones,inf
 import cPickle
 import pdb
 import Configuration as Conf
-from tools.PyGff import *
+from PyGff import *
 import io_pickle
 import scipy.io
 import pdb
@@ -28,6 +28,7 @@ def paths_load_data_solexa(expt,genome_info,PAR):
    Exons = []
    Ests = []
    Qualities = []
+   SplitPositions = []
 
    for line in open(est_filename):
       line = line.strip()
@@ -52,29 +53,34 @@ def paths_load_data_solexa(expt,genome_info,PAR):
       #Acceptors.append([-inf]*currentSize)
       #Donors.append([-inf]*currentSize)
 
-      exon_idx = int(exon_idx)
-      currentExons = zeros((len(currentGene.exons),2))
+      exon_idx = int(exon_idx)-1
+      currentExons = zeros((2,2))
       #for idx in range(len(currentGene.exons)):
       #   currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
       #   currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
 
-      currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
-      currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
+      try:
+         currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
+         currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
+
+         currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
+         currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
 
-      currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
-      currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
+         cut_offset = 500
 
-      cut_offset = 500
-      up_cut   = currentExons[0,0] - cut_offset
-      down_cut = currentExons[1,1] + cut_offset
+         up_cut   = int(currentExons[0,0]) - cut_offset
+         down_cut = int(currentExons[1,1]) + cut_offset
 
-      if up_cut < 0:
-         up_cut = 0
+         if up_cut < 0:
+            up_cut = 0
 
-      if down_cut > len(currentSeq):
-         down_cut = len(currentSeq)
+         if down_cut > len(currentSeq):
+            down_cut = len(currentSeq)
 
-      currentSeq = currentSeq[up_cut:down_cut]
+         currentSeq = currentSeq[up_cut:down_cut]
+
+      except:
+         pdb.set_trace()
 
       Sequences.append(currentSeq)
 
@@ -98,7 +104,7 @@ def paths_load_data_solexa(expt,genome_info,PAR):
    
    print 'found %d examples' % len(Sequences)
 
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
+   return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
 
 
 def paths_load_data_pickle(expt,genome_info,PAR):
diff --git a/qpalma/paths_load_data.py b/qpalma/paths_load_data.py
deleted file mode 100644 (file)
index 586a5fd..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
diff --git a/qpalma/paths_load_data_pickle.py b/qpalma/paths_load_data_pickle.py
deleted file mode 100644 (file)
index 586a5fd..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-
diff --git a/qpalma/paths_load_data_solexa.py b/qpalma/paths_load_data_solexa.py
deleted file mode 100644 (file)
index 200cbcd..0000000
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy.matlib import mat,zeros,ones,inf
-import cPickle
-import pdb
-import Configuration as Conf
-from tools.PyGff import *
-
-
-def paths_load_data_solexa(expt,genome_info,PAR):
-   # expt can be 'training','validation' or 'test'
-   assert expt in ['training','validation','test']
-
-   dna_filename = Conf.dna_filename
-   est_filename = Conf.est_filename
-   tair7_seq_filename = Conf.tair7_seq_filename 
-
-   tair7_seq = cPickle.load(open(tair7_seq_filename))
-   allGenes  = cPickle.load(open(dna_filename))
-
-   Sequences = []
-   Acceptors = []
-   Donors = []
-   Exons = []
-   Ests = []
-   Qualities = []
-
-   for line in open(est_filename):
-      line = line.strip()
-      chr,strand,seq, splitpos, length, prb, cal, chastity, gene_id, exon_idx = line.split()
-      splitpos = int(splitpos)
-      length = int(length)
-      prb = [ord(elem)-50 for elem in prb]
-      cal = [ord(elem)-64 for elem in cal]
-      chastity = [ord(elem)+10 for elem in chastity]
-
-      assert len(prb) == len(seq)
-
-      currentGene = allGenes[gene_id]
-      seq = seq.lower()
-
-      try:
-         currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
-      except:
-         continue
-         
-      #assert currentSize == len(Sequences[-1]), 'gene_size %d / seq size %d' % (currentSize,len(Sequences[-1]))
-      #Acceptors.append([-inf]*currentSize)
-      #Donors.append([-inf]*currentSize)
-
-      exon_idx = int(exon_idx)
-      currentExons = zeros((len(currentGene.exons),2))
-      #for idx in range(len(currentGene.exons)):
-      #   currentExons[idx,0] = currentGene.exons[idx][0]-currentGene.start
-      #   currentExons[idx,1] = currentGene.exons[idx][1]-currentGene.start
-
-      currentExons[0,0] = currentGene.exons[exon_idx-1][0]-currentGene.start
-      currentExons[0,1] = currentGene.exons[exon_idx-1][1]-currentGene.start
-
-      currentExons[1,0] = currentGene.exons[exon_idx][0]-currentGene.start
-      currentExons[1,1] = currentGene.exons[exon_idx][1]-currentGene.start
-
-      cut_offset = 500
-      up_cut   = currentExons[0,0] - cut_offset
-      down_cut = currentExons[1,1] + cut_offset
-
-      if up_cut < 0:
-         up_cut = 0
-
-      if down_cut > len(currentSeq):
-         down_cut = len(currentSeq)
-
-      currentSeq = currentSeq[up_cut:down_cut]
-
-      Sequences.append(currentSeq)
-
-      currentSize = len(Sequences[-1])
-      Acceptors.append([0]*currentSize)
-      Donors.append([0]*currentSize)
-
-      Exons.append(currentExons)
-      Ests.append(seq)
-      Qualities.append(prb)
-
-      SplitPositions.append(int(splitpos))
-
-      if len(Sequences[-1]) == 2755:
-         Sequences = Sequences[:-1]
-         Acceptors = Acceptors[:-1]
-         Donors    = Donors[:-1]
-         Exons     = Exons[:-1]
-         Ests      = Ests[:-1]
-         Qualities = Qualities[:-1]
-   
-   print 'found %d examples' % len(Sequences)
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
index 771d4d6..4757209 100644 (file)
@@ -17,8 +17,9 @@ from numpy.matlib import mat,zeros,ones,inf
 from numpy.linalg import norm
 
 import QPalmaDP
-from SIQP_CPX import SIQPSolver
 
+import qpalma
+from qpalma.SIQP_CPX import SIQPSolver
 from qpalma.DataProc import *
 
 from generateEvaluationData import *
index 766ca0e..0c83f16 100644 (file)
@@ -17,23 +17,23 @@ from numpy.matlib import mat,zeros,ones,inf
 from numpy.linalg import norm
 
 import QPalmaDP
-from SIQP_CPX import SIQPSolver
 
+import qpalma
+from qpalma.SIQP_CPX import SIQPSolver
 from qpalma.DataProc import *
 
-from generateEvaluationData import *
+from qpalma.generateEvaluationData import *
+from qpalma.computeSpliceWeights import *
+from qpalma.set_param_palma import *
+from qpalma.computeSpliceAlignWithQuality import *
+from qpalma.penalty_lookup_new import *
+from qpalma.compute_donacc import *
+from qpalma.TrainingParam import Param
+from qpalma.export_param import *
 
-from computeSpliceWeights import *
-from set_param_palma import *
-from computeSpliceAlignWithQuality import *
-from penalty_lookup_new import *
-from compute_donacc import *
-from TrainingParam import Param
-from export_param import *
-
-import Configuration
-from Plif import Plf
-from Helpers import *
+import qpalma.Configuration
+from qpalma.Plif import Plf
+from qpalma.Helpers import *
 
 def getQualityFeatureCounts(qualityPlifs):
    weightQuality = qualityPlifs[0].penalties
@@ -49,14 +49,12 @@ class QPalma:
    
    def __init__(self):
       self.ARGS = Param()
-
       self.logfh = open('qpalma.log','w+')
-      gen_file= '%s/genome.config' % self.ARGS.basedir
-
-      ginfo_filename = 'genome_info.pickle'
-      self.genome_info = fetch_genome_info(ginfo_filename)
 
-      self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
+      #gen_file= '%s/genome.config' % self.ARGS.basedir
+      #ginfo_filename = 'genome_info.pickle'
+      #self.genome_info = fetch_genome_info(ginfo_filename)
+      #self.plog('genome_info.basedir is %s\n'%self.genome_info.basedir)
 
       #self.ARGS.train_with_splicesitescoreinformation = False
 
@@ -71,7 +69,7 @@ class QPalma:
          Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
          use_quality_scores = False
       elif Configuration.mode == 'using_quality_scores':
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities = paths_load_data_solexa('training',self.genome_info,self.ARGS)
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
 
          #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
 
index 07c8613..68a9f8a 100644 (file)
@@ -262,10 +262,10 @@ void process_reads(FILE* reads_fs,struct gene*** allGenes,int numGenes, FILE* ou
             exon_idx++;
 
             if (ue != 0 && dov != 0)
-               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx);
+               combine_info(prev_exon_stop,cur_exon_start,upstream_end,ue,downstream_overlap,dov,out_fs,gene_id,exon_idx-1);
 
             if (uo != 0 && ds != 0) 
-               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx);
+               combine_info(prev_exon_stop,cur_exon_start,upstream_overlap,uo,downstream_start,ds,out_fs,gene_id,exon_idx-1);
 
             ue = uo = ds = dov = 0;
             goto exon_label;