+ cleaning up code base
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 25 Jun 2008 10:12:29 +0000 (10:12 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Wed, 25 Jun 2008 10:12:29 +0000 (10:12 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9772 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
qpalma/DataProc.py [deleted file]
qpalma/TrainingParam.py [deleted file]
qpalma/computeSpliceAlignWithQuality.py
qpalma/export_param.py [deleted file]
qpalma/generateEvaluationData.py [deleted file]
qpalma/sequence_utils.py

index 7a84d66..e1cb4c7 100644 (file)
@@ -8,7 +8,7 @@ import cPickle
 # choose a path where all results of the QPalma pipeline will be stored
 #
 
-result_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run'
+result_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/prediction'
 
 
 ###############################################################################
@@ -74,7 +74,8 @@ mode = 'using_quality_scores'
 # instead.
 ###############################################################################
 
-read_size = 36
+read_size = 38
+#read_size = 36
 extension = (250,500)
 
 numLengthSuppPoints  = 10
diff --git a/qpalma/DataProc.py b/qpalma/DataProc.py
deleted file mode 100644 (file)
index ce41f38..0000000
+++ /dev/null
@@ -1,272 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy.matlib import mat,zeros,ones,inf
-import cPickle
-import pdb
-#import pydb
-import Configuration as Conf
-import tools
-from tools.PyGff import *
-import io_pickle
-#import scipy.io
-import sys
-
-from genome_utils import load_genomic
-from Genefinding import *
-
-def paths_load_data(filename):
-
-   #data = io_pickle.load(filename)
-   data = cPickle.load(open(filename))
-
-   SeqInfo     = data[0]
-   Exons       = data[1]
-   OriginalEsts= data[2]
-   Qualities   = data[3]
-   AlternativeSequences = data[4]
-
-   return SeqInfo, Exons, OriginalEsts, Qualities,\
-   AlternativeSequences
-
-
-def paths_load_data_solexa(expt,genome_info,PAR,test=False):
-   """
-
-   """
-   # expt can be 'training','validation' or 'test'
-   assert expt in ['training','validation','test']
-
-   dna_filename = Conf.dna_filename
-   est_filename = Conf.est_filename
-
-   pdb.set_trace()
-
-   allGenes  = cPickle.load(open(dna_filename))
-
-   Sequences = []
-   Acceptors = []
-   Donors = []
-   Exons = []
-   Ests = []
-   Qualities = []
-   SplitPositions = []
-
-   rrp = RemappedReadParser('est_filename')
-
-   # Iterate over all reads
-   for line in open(est_filename):
-      line = line.strip()
-      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
-
-      splitpos = int(splitpos)
-      length = int(length)
-      prb = [ord(elem)-50 for elem in prb]
-      cal = [ord(elem)-64 for elem in cal]
-      chastity = [ord(elem)+10 for elem in chastity]
-
-      assert len(prb) == len(seq)
-
-      currentGene = allGenes[gene_id]
-      seq = seq.lower()
-
-      #try:
-      #   currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
-      #except:
-      #   continue
-
-      genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      chrom    = 'chr1'
-
-      # use matlab-style functions to access dna sequence
-      dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
-      currentSeq = dna
-
-      if len(currentSeq) < (currentGene.stop - currentGene.start):
-         continue
-
-      # compare both sequences
-      #assert dna == currentSeq, pdb.set_trace()
-   
-      p_start = int(p_start)
-      exon_stop = int(exon_stop)
-      exon_start = int(exon_start)
-      p_stop = int(p_stop)
-
-      assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
-      assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
-      assert p_stop - p_start >= 36
-   
-      currentExons = zeros((2,2))
-
-      currentExons[0,0] = p_start
-      currentExons[0,1] = exon_stop
-
-      currentExons[1,0] = exon_start
-      currentExons[1,1] = p_stop
-
-      # make indices relative to start pos of the current gene
-      currentExons -= currentGene.start
-
-      # determine cutting positions for sequences
-      up_cut   = int(currentExons[0,0])
-      down_cut = int(currentExons[1,1])
-
-      # if we perform testing we cut a much wider region as we want to detect
-      # how good we perform on a region
-      if test:
-         up_cut = up_cut-500
-         if up_cut < 0:
-            up_cut = 0
-
-         down_cut = down_cut+500
-
-         if down_cut > len(currentSeq):
-            down_cut = len(currentSeq)
-
-      # cut out part of the sequence
-      currentSeq = currentSeq[up_cut:down_cut]
-
-      # ensure that index is correct
-      currentExons[0,1] += 1
-
-      if test:
-         currentExons -= up_cut
-      else:
-         currentExons -= currentExons[0,0]
-
-      try:
-         if not (currentSeq[int(currentExons[0,1])] == 'g' and\
-         currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
-            continue
-
-         if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
-            continue
-
-      except:
-         pdb.set_trace()
-
-      Sequences.append(currentSeq)
-
-      # now we want to use interval_query to get the predicted splice scores
-      # trained on the TAIR sequence and annotation
-
-      
-      interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
-      num_fields = 1
-      num_intervals = 1
-
-      # fetch acceptor scores
-      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+' 
-      gf = Genef()
-      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
-      assert num_hits <= len(currentSeq)
-      
-      #print 'Acceptor hits: %d' % num_hits
-
-      pos = createIntArrayFromList([0]*num_hits)
-      indices = createIntArrayFromList([0]*num_hits)
-      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
-      gf.getResults(pos,indices,scores)
-
-      currentAcceptors  = [-inf]*(len(currentSeq)+1)
-      
-      for i in range(num_hits):
-         position = pos[i] 
-         position -= (currentGene.start+up_cut)
-         assert 0 <= position < len(currentSeq)+1, 'position index wrong'
-         currentAcceptors[position] = scores[i]
-
-      currentAcceptors = currentAcceptors[1:] + [-inf]
-      Acceptors.append(currentAcceptors)
-
-      del gf
-
-      # fetch donor scores
-      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
-      gf = Genef()
-      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
-      assert num_hits <= len(currentSeq)
-
-      #print 'Donor hits: %d' % num_hits
-      pos = createIntArrayFromList([0]*num_hits)
-      indices = createIntArrayFromList([0]*num_hits)
-      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
-      gf.getResults(pos,indices,scores)
-
-      currentDonors     = [-inf]*(len(currentSeq))
-
-      try:
-         for i in range(1,num_hits):
-            position = pos[i] 
-            position -= (currentGene.start+up_cut)
-            currentDonors[position-1] = scores[i]
-      except:
-         pdb.set_trace()
-
-      currentDonors = [-inf] + currentDonors[:-1]
-      Donors.append(currentDonors)
-
-      Exons.append(currentExons)
-      Ests.append(seq)
-      Qualities.append(prb)
-
-      SplitPositions.append(splitpos)
-
-      #if len(Sequences[-1]) == 2755:
-      #   Sequences = Sequences[:-1]
-      #   Acceptors = Acceptors[:-1]
-      #   Donors    = Donors[:-1]
-      #   Exons     = Exons[:-1]
-      #   Ests      = Ests[:-1]
-      #   Qualities = Qualities[:-1]
-      #   SplitPositions = SplitPositions[:-1]
-   
-   #print 'found %d examples' % len(Sequences)
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
-
-
-def paths_load_data_pickle(expt,genome_info,PAR):
-   # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
-   # Load the relevant file and return the alignment data
-
-   # expt can be 'training','validation' or 'test'
-
-   assert expt in ['training','validation','test']
-
-   tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
-
-   Noises = [];
-
-   if expt == 'training':
-      if PAR.microexon:
-         if PAR.LOCAL_ALIGN: # local version
-
-            train_data = '%s/microexon_train_data.pickle' % genome_info.basedir
-            data = cPickle.load(open(train_data))
-
-         else: # global version
-            pass
-
-
-      else:
-         train_data = '%s/exons_train_local.pickle' % genome_info.basedir
-         data = cPickle.load(open(train_data))
-
-      #print 'train_data is %s' % train_data
-
-      Sequences = data['Train']       # dna sequences
-      Acceptors = data['TrainAcc']    # acceptor scores
-      Donors    = data['TrainDon']    # donor scores
-      Exons     = data['TrainExon']   # exon boundaries
-      Ests      = data['TrainEsts']   # est sequences
-     
-   # Lower all indices by one to convert matlab 
-   # to python indices
-
-   Exons -= 1
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Noises
diff --git a/qpalma/TrainingParam.py b/qpalma/TrainingParam.py
deleted file mode 100644 (file)
index 2d5b28b..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-class Param:
-   
-   def __init__(self):
-      """ default parameters """
-
-      self.basedir = '/fml/ag-raetsch/share/projects/qpalma/elegans_palma2'
-      #self.basedir = '/fml/ag-raetsch/share/projects/qpalma/zebrafish'
-      self.MAX_MEM = 31000;
-      self.LOCAL_ALIGN = 1;
-      self.init_param = 1;
-      self.train_with_splicesitescoreinformation = 1;
-      self.train_with_intronlengthinformation = 1;
-      self.C = 0.001;
-      self.microexon = 0;
-      self.prob = 0;
-      self.organism = 'elegans'
-      self.expt = 'training'
-         
-      self.insertion_prob  = self.prob/3 ;
-      self.deletion_prob   = self.prob/3 ;
-      self.mutation_prob   = self.prob/3 ;
index a998119..243bafa 100644 (file)
@@ -6,7 +6,7 @@ from numpy.matlib import mat,zeros,ones,inf
 import pdb
 from Plif import Plf,base_coord,linspace
 
-import Helpers
+import sequence_utils
 
 def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, qualityPlifs, run):
    """
@@ -98,10 +98,10 @@ def  computeSpliceAlignWithQuality(dna, exons, est, original_est, quality, quali
          orig_pos += 1
 
    for orig_char in dna_part:
-      assert orig_char in Helpers.alphabet, pdb.set_trace()
+      assert orig_char in sequence_utils.alphabet, pdb.set_trace()
 
    for orig_char in est_part:
-      assert orig_char in Helpers.alphabet, pdb.set_trace()
+      assert orig_char in sequence_utils.alphabet, pdb.set_trace()
 
    trueWeightMatch = zeros((run['matchmatrixRows']*run['matchmatrixCols'],1)) # Scorematrix fuer Wahrheit
 
diff --git a/qpalma/export_param.py b/qpalma/export_param.py
deleted file mode 100644 (file)
index 57d2c94..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-###########################################################
-#
-###########################################################
-
-import bz2
-
-def writeStruct(fid,plif):
-   fid.write('%s_limits=%s\n'%(plif.name,str(plif.limits)))
-   fid.write('%s_penalties=%s\n'%(plif.name,str(plif.penalties)))
-   fid.write('%s_bins=%d\n'%(plif.name,len(plif.limits)))
-
-   if plif.name == 'intron':
-      fid.write('%s_len_limits=%s\n'%(plif.name,str(plif.limits)))
-      fid.write('%s_len_penalties=%s\n'%(plif.name,str(plif.penalties)))
-      fid.write('%s_len_bins=%d\n'%(plif.name,len(plif.limits)))
-      fid.write('%s_len_min=%d\n'%(plif.name,plif.min_len))
-      fid.write('%s_len_max=%d\n'%(plif.name,plif.max_len))
-      fid.write('%s_len_transform=%s\n'%(plif.name,plif.transform))
-
-def export_param(filename,h,d,a,mmatrix,qualityPlifs):
-
-   # Exports a bz2 file with the trained PALMA. Assumes splice sites and intron length used.
-   h.name = 'intron'
-   d.name = 'donor'
-   a.name = 'acceptor'
-
-   fid = bz2.BZ2File(filename+'.bz2','w')
-
-   fid.write('%palma definition file version: 1.0\n\n')
-   fid.write('%penalties\n');
-  
-   writeStruct(fid, h);
-   writeStruct(fid, a);
-   writeStruct(fid, d);
-
-   # substitution matrix
-   mmatrix = mmatrix.reshape(6,6)
-   fid.write('substitution_matrix=[')
-   for row in range(6):
-      if row == 5:
-         fid.write('%f, %f, %f, %f, %f, %f]\n'%(mmatrix[row,0],mmatrix[row,1],mmatrix[row,2],mmatrix[row,3],mmatrix[row,4],mmatrix[row,5]))
-      else:
-         fid.write('%f, %f, %f, %f, %f, %f;\n'%(mmatrix[row,0],mmatrix[row,1],mmatrix[row,2],mmatrix[row,3],mmatrix[row,4],mmatrix[row,5]))
-
-   
-   for elem in qualityPlifs:
-      writeStruct(fid, elem);
-
-   fid.close()
diff --git a/qpalma/generateEvaluationData.py b/qpalma/generateEvaluationData.py
deleted file mode 100644 (file)
index 6814855..0000000
+++ /dev/null
@@ -1,146 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from numpy.matlib import mat,zeros,ones,inf
-import random
-import pdb
-import os.path
-import cPickle
-
-class Dataset:
-  pass
-
-def sample(population, k):
-   """Chooses k random elements from a population sequence. """
-   n = len(population)
-   result = [None] * k
-   for i in xrange(k):
-      j = int(random.random() * n)
-      result[i] = population[j]
-   return result
-
-def generateData(numExamples):
-   dna_len = 216
-   est_len = 36
-   random.seed(14)
-
-   letters = ['a','c','g','t']
-
-   Sequences   = []
-   Acceptors   = []
-   Donors      = []
-   Exons       = []
-   Ests        = []
-
-   for i in range(numExamples):
-      dna = ''.join(sample(letters, dna_len))
-      Sequences.append(dna)
-
-      Acceptors.append([0.0]*dna_len)
-      Donors.append([0.0]*dna_len)
-
-      currentExon       = zeros((2,2))
-      currentExon[0,0]  = 0
-      currentExon[0,1]  = 72
-      currentExon[1,0]  = 144
-      currentExon[1,1]  = 216
-
-      Exons.append(currentExon)
-
-      est = ''.join(sample(letters, est_len))
-      Ests.append(est)
-
-   preNr    = 15
-   middleNr = 6
-   sufNr    = 15
-   Qualities  = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
-
-def generateData2(numExamples):
-   est_len = 36
-   random.seed(14)
-
-   letters = ['a','c','g','t']
-
-   Sequences   = []
-   Acceptors   = []
-   Donors      = []
-   Exons       = []
-   Ests        = []
-
-   for i in range(numExamples):
-      dna_len = random.randint(200,400)
-      dna = ''.join(sample(letters, dna_len))
-      #Sequences.append(dna)
-      begin = random.randint(70,dna_len-70)
-      end   = begin+60
-      dna = dna[0:begin] + 't'*18 + 'gt' + 'a'*20 + 'ag' + 't'*18 + dna[end:]
-      dna_len = len(dna)
-      Sequences.append(dna)
-      
-      currentDon  = [-3.0]*dna_len
-      currentAcc  = [-3.0]*dna_len
-
-      currentDon[begin+18+1] = 3.0
-      currentDon[begin+18+2] = 3.0
-
-      currentAcc[end-18-1] = 3.0
-      currentAcc[end-18-2] = 3.0
-
-      #pdb.set_trace()
-
-      Donors.append(currentDon)
-      Acceptors.append(currentAcc)
-
-      currentExon       = zeros((2,2))
-      currentExon[0,0]  = 0
-      currentExon[0,1]  = begin+18
-      currentExon[1,0]  = end-18
-      currentExon[1,1]  = dna_len-1
-
-      Exons.append(currentExon)
-
-      #est = ''.join(sample(letters, est_len))
-      est = dna[begin-18:begin] + dna[end+1:end+19]
-      est = 't'*est_len
-      Ests.append(est)
-
-   preNr    = 15
-   middleNr = 6
-   sufNr    = 15
-   #Qualities  = [[40]*preNr + [-1]*middleNr + [40]*sufNr]*numExamples
-   Qualities  = [[40]*est_len]*numExamples
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities
-
-def loadArtificialData(numExamples):
-   filename = 'artificial_dset_%d'%numExamples
-   if not os.path.exists(filename):
-      s,a,d,e,est,q = generateData2(numExamples)
-      dset = Dataset()
-      dset.Sequences = s
-      dset.Acceptor  = a
-      dset.Donors    = d
-      dset.Exons     = e
-      dset.Ests      = est
-      dset.Qualities = q
-      cPickle.dump(dset,open(filename,'w+'))
-   else:
-      dset = cPickle.load(open(filename))
-      s = dset.Sequences
-      a = dset.Acceptor
-      d = dset.Donors
-      e =  dset.Exons
-      est = dset.Ests
-      q = dset.Qualities
-
-   return s,a,d,e,est,q
-
-if __name__ == '__main__':
-   Sequences, Acceptors, Donors, Exons, Ests, Qualities = generateData(10)
-   print Acceptors
-   print Donors
-   print Exons
-   print Ests
-   print Qualities
index 693d7df..9a8e983 100644 (file)
@@ -18,6 +18,8 @@ from numpy.matlib import inf
 from Genefinding import *
 from genome_utils import load_genomic
 
+extended_alphabet = ['-','a','c','g','t','n','[',']']
+alphabet          = ['-','a','c','g','t','n']
 
 def get_flatfile_size(filename):
    cmd =  'wc -c %s | cut -f1 -d \' \'' % filename
@@ -43,7 +45,7 @@ def reverse_complement(seq):
    """
 
    bpos = seq.find('[')
-   rc = lambda x: {'a':'t','c':'g','g':'c','t':'a'}[x]
+   rc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n'}[x]
 
    # check first whether seq contains no brackets at all
    if bpos == -1:
@@ -51,7 +53,7 @@ def reverse_complement(seq):
       ret_val.reverse()
       ret_val = "".join(ret_val)
    else:
-      brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','[':'[',']':']'}[x]
+      brc = lambda x: {'a':'t','c':'g','g':'c','t':'a','n':'n','[':'[',']':']'}[x]
 
       # first_part is the part of the seq up to the first occurrence of a
       # bracket
@@ -220,7 +222,7 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
       assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
       assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
       assert len(currentAcc) == len(currentDon), pdb.set_trace()
-   else
+   else:
       for pos in ag_tuple_pos:
          if currentAcc[pos] == -inf:
             currentAcc[pos] = 0.01