+ minor bugfixes in the compile_dataset script
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 5 Feb 2008 11:10:45 +0000 (11:10 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Tue, 5 Feb 2008 11:10:45 +0000 (11:10 +0000)
+ added parameters to Configuration script
+ changed representation of entries in the parser

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7683 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
qpalma/parsers.py
qpalma/set_param_palma.py
scripts/compile_dataset.py
scripts/qpalma_main.py

index c6f1ab2..1d07457 100644 (file)
@@ -20,7 +20,11 @@ fixedParamQ = cPickle.load(open('/fml/ag-raetsch/home/fabio/svn/projects/QPalma/
 #
 C = 1
 
+min_intron_len = 10
+max_intron_len = 1000
 
+min_svm_score = 0.0 
+max_svm_score = 1.0
 
 ###############################################################################
 # 
@@ -128,8 +132,8 @@ else:
 training_begin    =    0
 training_end      = 1500
 
-prediction_begin  = 1500
-prediction_end    = 2200
+prediction_begin  = 0
+prediction_end    = 1500
 
 joinp = os.path.join
 
index a0b968d..03bc912 100644 (file)
@@ -43,6 +43,16 @@ class FilteredReadParser(ReadParser):
       splitpos = int(splitpos)
       read_size = int(read_size)
 
+      assert strand in ['D','P']
+
+      if strand == 'D':
+         strand = '+'
+
+      if strand == 'P':
+         strand = '-'
+
+      chr = int(chr)
+
       prb = [ord(elem)-50 for elem in prb]
       cal_prb = [ord(elem)-64 for elem in cal_prb]
       chastity = [ord(elem)+10 for elem in chastity]
index 47462f3..7bbb0da 100644 (file)
@@ -8,55 +8,16 @@ import pdb
 import Configuration
 from Plif import *
 
-def set_params_pa():
-   h = plf()
-   h.len = int(model.intron_len_bins) 
-   h.limits = model.intron_len_limits 
-   h.penalties = model.intron_len_penalties 
-   h.name = 'h'
-   h.max_len   = int(max_intron_len)
-   h.min_len   = int(min_intron_len)
-   h.transform = model.intron_len_transform
-
-   d = plf()
-   d.len = int(model.donor_bins)
-   d.limits = model.donor_limits
-   d.penalties = model.donor_penalties 
-   d.name = 'd'
-   d.max_len   = 100
-   d.min_len   = -100
-
-   a = plf()
-   a.len = int(model.acceptor_bins)
-   a.limits = model.acceptor_limits 
-   a.penalties = model.acceptor_penalties 
-   a.name = 'a'
-   a.max_len   = 100
-   a.min_len   = -100
-
-   mmatrix = model.substitution_matrix 
-
-
 def set_param_palma(param, train_with_intronlengthinformation,\
                                                min_intron_len=None, max_intron_len=None, min_svm_score=None, max_svm_score=None):
 
    print 'Setting parameters ...'
    
-   #if min_intron_len == None:
-   #   if train_with_intronlengthinformation:
-   #      min_intron_len=20
-   #      max_intron_len=1000
-   #   else:
-   #      min_intron_len = 1
-   #      max_intron_len = 2
-   min_intron_len=4
-   max_intron_len=10000
+   min_intron_len = Conf.min_intron_len 
+   max_intron_len = Conf.max_intron_len
 
-   #if min_intron_len != None and max_intron_len != None:
-   #   min_svm_score=-5
-   #   max_svm_score=5
-   min_svm_score=-3
-   max_svm_score=1
+   min_svm_score = Conf.min_svm_score 
+   max_svm_score = Conf.max_svm_score 
 
    h = Plf()
    d = Plf()
@@ -73,8 +34,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    ####################
    # Gapfunktion
    ####################
-   h.limits    = logspace(math.log(min_intron_len,10),math.log(max_intron_len,10),30) 
-   #h.limits    = linspace(min_intron_len,max_intron_len,30)
+   h.limits    = logspace(math.log(min_intron_len,10),math.log(max_intron_len,10),lengthSP) 
    h.penalties = param[0:lengthSP].flatten().tolist()[0]
    #h.transform = '+1' 
    h.transform = '' 
@@ -88,7 +48,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    ####################
    # Donorfunktion
    ####################
-   d.limits    = linspace(min_svm_score,max_svm_score,30)
+   d.limits    = linspace(min_svm_score,max_svm_score,donSP)
    d.penalties = param[lengthSP:lengthSP+donSP].flatten().tolist()[0]
    #d.transform = '+1' 
    d.transform = '' 
@@ -102,7 +62,7 @@ def set_param_palma(param, train_with_intronlengthinformation,\
    ####################
    # Acceptorfunktion
    ####################
-   a.limits    = linspace(min_svm_score,max_svm_score,30)
+   a.limits    = linspace(min_svm_score,max_svm_score,accSP)
    a.penalties = param[lengthSP+donSP:lengthSP+donSP+accSP].flatten().tolist()[0]
    #a.transform = '+1' 
    a.transform = '' 
index 30b651c..1bfafd3 100644 (file)
@@ -100,12 +100,9 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
 
       gene_id = currentFRead['gene_id']
       currentGene = allGenes[gene_id]
-
-      chrom             = 'chr1'
-      genome_file       = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
-
-      seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,chrom,genome_file,sscore_filename)
+   
+      #seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene,genome_file,sscore_filename)
+      seq, acc, don, exons, est, qual, spos = process_read(currentRReads,currentFRead,currentGene)
       
       if seq == '':
          continue
@@ -127,16 +124,27 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    io_pickle.save(dataset_file,dataset)
 
 
-def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
+#def process_read(reReads,fRead,currentGene,genome_file,sscore_filename):
+def process_read(reReads,fRead,currentGene):
    """
    The purpose of this function is to create an example for QPalma 
    by using a 
 
    """
+   chr = fRead['chr']
+   strand = fRead['strand']
+
+   chrom             = 'chr%d' % chr
+   genome_file       = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
+
    # use matlab-style functions to access dna sequence
-   currentSeq = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
+   currentSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,genome_file,one_based=True)
    currentSeq = currentSeq.lower()
 
+   if strand == '-':
+      currentSeq = reverse_complement(currentSeq)
+
    nil = ('','','','','','','')
 
    if len(currentSeq) < (currentGene.stop - currentGene.start):
@@ -151,6 +159,9 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    quality = fRead['prb']
    spos = fRead['splitpos']
 
+   #if fRead['chr'] != 1 and fRead['strand'] != 'D':
+   #   return nil
+
    assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
    assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
    assert p_stop - p_start >= 36
@@ -209,9 +220,12 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    # now we want to use interval_query to get the predicted splice scores
    # trained on the TAIR sequence and annotation
    
-   interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
+   interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
    num_fields = 1
    num_intervals = 1
+   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+' 
+
+   ag_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and currentSeq[p-1]=='a' and e=='g' ]
 
    # fetch acceptor scores
    gf = Genef()
@@ -224,12 +238,14 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    scores = createDoubleArrayFromList([0]*num_hits*num_fields)
    gf.getResults(pos,indices,scores)
 
-   acc  = [-inf]*(len(currentSeq)+1)
-   
+   acc  = [-inf]*(len(currentSeq))
+
    for i in range(num_hits):
       position = pos[i] 
       position -= (currentGene.start+up_cut)
-      assert 0 <= position < len(currentSeq)+1, 'position index wrong'
+      if position < 2 or position > len(currentSeq)-1:
+         continue
+      assert 0 <= position < len(currentSeq), 'position index wrong'
       acc[position] = scores[i]
 
    acc = acc[1:] + [-inf]
@@ -237,10 +253,13 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    del gf
 
    # fetch donor scores
+   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
    gf = Genef()
    num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
    assert num_hits <= len(currentSeq)
 
+   gt_tuple_pos = [p for p,e in enumerate(currentSeq) if p>0 and p<len(currentSeq)-1 and e=='g' and (currentSeq[p+1]=='t' or currentSeq[p+1]=='c')]
+
    #print 'Donor hits: %d' % num_hits
    pos = createIntArrayFromList([0]*num_hits)
    indices = createIntArrayFromList([0]*num_hits)
@@ -250,22 +269,42 @@ def process_read(reReads,fRead,currentGene,chrom,genome_file,sscore_filename):
    don     = [-inf]*(len(currentSeq))
 
    try:
-      for i in range(1,num_hits):
+      for i in range(num_hits):
          position = pos[i] 
          position -= (currentGene.start+up_cut)
+         if position < 1 or position >= len(currentSeq)-1:
+            continue
          don[position-1] = scores[i]
    except:
       pdb.set_trace()
 
    don = [-inf] + don[:-1]
 
+   #for p,e in enumerate(acc):
+   #   if p not in ag_tuple_pos:
+   #      acc[p] = -inf
+
+   #for p,e in enumerate(don):
+   #   if p not in gt_tuple_pos:
+   #      don[p] = -inf
+
+   assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
+   assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+
+   assert (len(currentSeq)) == len(acc), pdb.set_trace()
+   assert (len(currentSeq)) == len(don), pdb.set_trace()
+
+   acc = [-inf] + acc[:-1]
+
    return currentSeq, acc, don, currentExons, currentReadSeq, quality, spos
 
+
 def reverse_complement(seq):
    map = {'a':'t','c':'g','g':'c','t':'a'}
 
    new_seq = [map[elem] for elem in seq]
    new_seq.reverse()
+   new_seq = "".join(new_seq)
 
    return new_seq
    
index 34ebda3..53d509a 100644 (file)
@@ -51,9 +51,9 @@ class QPalma:
    def __init__(self):
       self.ARGS = Param()
 
-      from compile_dataset import compile_d
+      #from compile_dataset import compile_d
+      #compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn)
 
-      compile_d(Conf.gff_fn,Conf.dna_flat_fn,Conf.filtered_fn,Conf.remapped_fn,Conf.tmp_dir,Conf.dataset_fn)
       #gen_file= '%s/genome.config' % self.ARGS.basedir
       #ginfo_filename = 'genome_info.pickle'
       #self.genome_info = fetch_genome_info(ginfo_filename)
@@ -82,14 +82,28 @@ class QPalma:
          self.Donors      = Donors
          self.Acceptors   = Acceptors
 
-         #self.Acceptors   = self.Acceptors[:end]
-         #self.Donors      = self.Donors[:end]
+         min_score = 100
+         max_score = -100
+
+         for elem in self.Acceptors:
+            for score in elem:
+               if score != -inf:
+                  min_score = min(min_score,score)
+                  max_score = max(max_score,score)
+
+         print 'Acceptors max/min: %f / %f' % (max_score,min_score)
+
+         min_score = 100
+         max_score = -100
+
+         for elem in self.Donors:
+            for score in elem:
+               if score != -inf:
+                  min_score = min(min_score,score)
+                  max_score = max(max_score,score)
+
+         print 'Donor max/min: %f / %f' % (max_score,min_score)
 
-         #Sequences, Acceptors, Donors, Exons, Ests, Qualities = loadArtificialData(1000)
-         #Sequences, Acceptors, Donors, Exons, Ests, Noises = paths_load_data_pickle('training',self.genome_info,self.ARGS)
-         #Qualities = []
-         #for i in range(len(Ests)):
-         #   Qualities.append([40]*len(Ests[i]))
          self.use_quality_scores = True
       else:
          assert(False)
@@ -496,8 +510,7 @@ class QPalma:
       print_matrix            = Conf.print_matrix 
       anzpath                 = Conf.anzpath 
 
-      #param_filename='/fml/ag-raetsch/home/fabio/svn/projects/QPalma/scripts/param_25.pickle'
-      param = load_param(param_filename)
+      param = cPickle.load(open(param_filename))
 
       # Set the parameters such as limits penalties for the Plifs
       [h,d,a,mmatrix,qualityPlifs] = set_param_palma(param,self.ARGS.train_with_intronlengthinformation)
@@ -700,13 +713,19 @@ class QPalma:
       for idx in range(len(eBegin_neg)):
          mean_eBegin_neg += eBegin_neg[idx]
          
-      mean_eBegin_neg /= 1.0*len(eBegin_neg)
+      try:
+         mean_eBegin_neg /= 1.0*len(eBegin_neg)
+      except:
+         mean_eBegin_neg = -1
 
       mean_eEnd_neg = 0
       for idx in range(len(eEnd_neg)):
          mean_eEnd_neg += eEnd_neg[idx]
 
-      mean_eEnd_neg /= 1.0*len(eEnd_neg)
+      try:
+         mean_eEnd_neg /= 1.0*len(eEnd_neg)
+      except:
+         mean_eEnd_neg = -1
 
       return eBegin_pos,eBegin_neg,eEnd_pos,eEnd_neg,mean_eBegin_neg,mean_eEnd_neg
 
@@ -731,44 +750,6 @@ def fetch_genome_info(ginfo_filename):
    else:
       return cPickle.load(open(ginfo_filename))
 
-
-def plifs2param(h,d,a,mmatrix,qualityPlifs):
-   donSP       = Conf.numDonSuppPoints
-   accSP       = Conf.numAccSuppPoints
-   lengthSP    = Conf.numLengthSuppPoints
-   mmatrixSP   = Conf.sizeMatchmatrix[0]\
-   *Conf.sizeMatchmatrix[1]
-
-
-   param = zeros((Conf.numFeatures,1))
-   param[0:donSP]                                               = mat(d.penalties[:]).reshape(donSP,1)
-   param[donSP:donSP+accSP]                                     = mat(a.penalties[:]).reshape(accSP,1)
-   param[donSP+accSP:donSP+accSP+lengthSP]                      = mat(h.penalties[:]).reshape(lengthSP,1)
-   param[donSP+accSP+lengthSP:donSP+accSP+lengthSP+mmatrixSP]   = mmatrix[:]
-
-   for idx in range(len(qualityPlifs)):
-      currentPlif       = qualityPlifs[idx]
-      begin             = lengthSP+donSP+accSP+mmatrixSP+(idx*Conf.numQualSuppPoints)
-      end               = lengthSP+donSP+accSP+mmatrixSP+((idx+1)*Conf.numQualSuppPoints)
-      param[begin:end]  = mat(currentPlif.penalties).reshape(Conf.numQualSuppPoints,1)
-
-   return param
-
-
-def load_param(filename):
-   param = None 
-   #try:
-   #   para = cPickle.load(open(filename))
-   #except:
-   #   print 'Error: Could not open parameter file!'
-   #   sys.exit(1)
-   #
-   #param = plifs2param(para.h,para.d,para.a,para.mmatrix,para.qualityPlifs)
-
-   param = cPickle.load(open(filename))
-   return param
-
-
 def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
    newExons = []
    oldElem = -1
@@ -791,10 +772,6 @@ def evaluateExample(dna,est,exons,SpliceAlign,newEstAlign,spos):
    if len(newExons) == 4:
       e1_begin,e1_end = newExons[0],newExons[1]
       e2_begin,e2_end = newExons[2],newExons[3]
-
-   #elif len(newExons) > 4 :
-   #   e1_begin,e1_end = newExons[0],newExons[1]
-   #   e2_begin,e2_end = newExons[-2],newExons[-1]
    else:
       return None,None,None,None,newExons