+ refactored compile_dataset -> removed remaining redundancies in the dataset object
author    fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 6 Mar 2008 17:34:53 +0000 (17:34 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 6 Mar 2008 17:34:53 +0000 (17:34 +0000)
+ added sanity check script for the dataset

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8085 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/Evaluation.py
scripts/check_dataset.py [new file with mode: 0644]
scripts/compile_dataset.py
scripts/qpalma_main.py
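
For reference, the refactored dataset object written by compile_dataset.py is now a plain list of five parallel lists. A minimal loading sketch (the pickle filename below is a placeholder, not part of this commit):

   import cPickle

   # load a dataset written by compile_d()
   dataset = cPickle.load(open('dataset.pickle'))
   SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences = dataset

   # per-example entries, as built in compile_d()/process_filtered_read():
   #   SeqInfo[i]   = (chr, strand, up_cut, down_cut)
   #   Exons[i]     = 2x2 numpy matrix with the absolute exon borders
   #   Qualities[i] = (prb, cal_prb, chastity)
   chr, strand, up_cut, down_cut = SeqInfo[0]
   print 'first example: chr%d %s, cut [%d,%d]' % (chr, strand, up_cut, down_cut)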

diff --git a/scripts/Evaluation.py b/scripts/Evaluation.py
index 5faf83d..706d052 100644
--- a/scripts/Evaluation.py
+++ b/scripts/Evaluation.py
@@ -86,22 +86,36 @@ def prediction_on(filename):
    allWrongExons  = []
    allDoubleScores = []
 
-   all_pos_correct_ctr = 0
+   pos_correct_ctr   = 0
+   pos_incorrect_ctr = 0
+   score_correct_ctr = 0
+   score_incorrect_ctr = 0
 
    for current_example_pred in allPredictions:
-      for elem_nr,current_pred in enumerate(current_example_pred[:1]):
+      gt_example = current_example_pred[0]
+      gt_score = gt_example['DPScores'].flatten().tolist()[0][0]
+      gt_correct = evaluate_unmapped_example(gt_example)
 
-         #gt_score = gt_example[] 
-         #score = gt_example['DPScores'].flatten().tolist()[0][0]
+      for elem_nr,current_pred in enumerate(current_example_pred[1:]):
 
-         #all_pos_correct = evaluate_example(current_pred)
-         all_pos_correct = evaluate_unmapped_example(current_pred)
+         current_label,comparison_result,current_score = evaluate_example(current_pred)
 
-         if all_pos_correct:
-            all_pos_correct_ctr += 1
+         if current_label:
+            if comparison_result:
+               pos_correct_ctr += 1
+            else:
+               pos_incorrect_ctr += 1
+         else:
+            if gt_correct and current_label == False and gt_score >= current_score:
+               score_correct_ctr += 1
+            elif gt_correct and current_label == False and not (gt_score >= current_score):
+               score_incorrect_ctr += 1
 
    print 'Total num. of examples: %d' % len(allPredictions)
-   print 'Number of total correct examples: %d' % all_pos_correct_ctr
+   print 'Number of correct examples: %d' % pos_correct_ctr
+   print 'Number of incorrect examples: %d' % pos_incorrect_ctr
+   print 'Number of correct scores : %d' % score_correct_ctr
+   print 'Number of incorrect scores: %d' % score_incorrect_ctr
 
    #print 'Correct positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_pos),len(e1End_pos),len(e2Begin_pos),len(e2End_pos))
    #print 'Incorrect positions:\t\t%d\t%d\t%d\t%d' % (len(e1Begin_neg),len(e1End_neg),len(e2Begin_neg),len(e2End_neg))
@@ -116,7 +130,6 @@ def evaluate_unmapped_example(current_prediction):
 
 
 def evaluate_example(current_prediction):
-   
    label = False
 
    try:
@@ -124,37 +137,42 @@ def evaluate_example(current_prediction):
    except:
       pass
 
-   #predExons = current_prediction['predExons']
-   #trueExons = current_prediction['trueExons']
-   #pdb.set_trace()
-
-   if label:
-
-      start_pos = current_prediction['start_pos']
-      alternative_start_pos = current_prediction['alternative_start_pos']
-
-      predExons = current_prediction['predExons']
-      trueExons = current_prediction['trueExons']
-
-      #pdb.set_trace()
-      
-      if start_pos <= alternative_start_pos:
-         start_offset = alternative_start_pos-start_pos
-         predExons += start_offset
-
-         return compare_exons(predExons,trueExons)
-
-      if alternative_start_pos <= start_pos:
-         start_offset = start_pos-alternative_start_pos
-         predExons += start_offset
-
-         return compare_exons(predExons,trueExons)
-
-   else:
-      pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]
-
-      return False
+   predExons = current_prediction['predExons']
+   trueExons = current_prediction['trueExons']
 
+   predPositions = [elem + current_prediction['alternative_start_pos'] for elem in predExons]
+   truePositions = [elem + current_prediction['start_pos'] for elem in trueExons.flatten().tolist()[0]]
+   #pdb.set_trace()
+   
+   pred_score = current_prediction['DPScores'].flatten().tolist()[0][0]
+
+   altPredPositions = [0]*4
+   if len(predPositions) == 4:
+      altPredPositions[0] = predPositions[0]
+      altPredPositions[1] = predPositions[1]+1
+      altPredPositions[2] = predPositions[2]+1
+      altPredPositions[3] = predPositions[3]
+
+   altPredPositions2 = [0]*4
+   if len(predPositions) == 4:
+      altPredPositions2[0] = predPositions[0]
+      altPredPositions2[1] = predPositions[1]
+      altPredPositions2[2] = predPositions[2]+1
+      altPredPositions2[3] = predPositions[3]
+
+   altPredPositions3 = [0]*4
+   if len(predPositions) == 4:
+      altPredPositions3[0] = predPositions[0]
+      altPredPositions3[1] = predPositions[1]
+      altPredPositions3[2] = predPositions[2]-1
+      altPredPositions3[3] = predPositions[3]
+
+   pos_comparison = (predPositions == truePositions or altPredPositions == truePositions or altPredPositions2 == truePositions or altPredPositions3 == truePositions)
+
+   if label == True and pos_comparison == False:
+      pdb.set_trace()
+
+   return label,pos_comparison,pred_score
 
 def compare_exons(predExons,trueExons):
    e1_b_off,e1_e_off,e2_b_off,e2_e_off = 0,0,0,0
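
A note on the comparison logic added above: the three altPredPositions variants simply tolerate the off-by-one conventions at the two inner exon borders. A compact equivalent helper (sketch only; positions_match is not part of this commit):

   def positions_match(pred, true):
      # accept the exact predicted positions or the three off-by-one
      # variants checked in evaluate_example() above
      if len(pred) != 4:
         return pred == true
      for off in [(0,0,0,0), (0,1,1,0), (0,0,1,0), (0,0,-1,0)]:
         if [p+o for p,o in zip(pred,off)] == true:
            return True
      return False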
diff --git a/scripts/check_dataset.py b/scripts/check_dataset.py
new file mode 100644
index 0000000..0a56f5b
--- /dev/null
+++ b/scripts/check_dataset.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import pdb
+import cPickle
+
+from compile_dataset import compile_d,unbracket_est,get_seq_and_scores,get_seq
+
+def check(idx):
+   pass
+
+def checkAll(filename):
+   dataset = cPickle.load(open(filename))
+   [SeqInfo, Exons, OriginalEsts, Qualities, AlternativeSequences] = dataset 
+
+   for idx in range(len(SeqInfo)):
+      currentExon = Exons[idx]
+      currentEst  = OriginalEsts[idx]
+      originalEst  = OriginalEsts[idx]
+
+      if currentEst.find('[') != -1:
+         #print currentEst
+         currentEst = unbracket_est(currentEst)
+         #print currentEst
+
+      assert len(currentEst) == 36, pdb.set_trace()
+
+      first_seq = get_seq( currentExon[0,0], currentExon[0,1], True )
+      end = first_seq[-2:]
+      first_seq = first_seq[:-2]
+      second_seq = get_seq( currentExon[1,0], currentExon[1,1]+1, False )
+      begin = second_seq[:2]
+      second_seq = second_seq[2:]
+
+      assert (len(first_seq) + len(second_seq)) == 36, pdb.set_trace()
+
+      if not (end == 'gt' or end == 'gc'):
+         print 'invalid donor at example %d' % idx
+         #invalid_donor_ctr += 1
+         #continue
+      if not (begin == 'ag'):
+         print 'invalid acceptor at example %d' % idx
+
+   pdb.set_trace() 
+
+
+if __name__ == '__main__':
+   checkAll(sys.argv[1])
+
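
The new check script takes the pickled dataset as its only argument and verifies the 36bp read length, the combined exon fragment length, and the canonical gt/gc donor and ag acceptor dinucleotides. Usage (the dataset filename is a placeholder):

   python check_dataset.py dataset.pickle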
diff --git a/scripts/compile_dataset.py b/scripts/compile_dataset.py
index 57467f9..8cf6ec6 100644
--- a/scripts/compile_dataset.py
+++ b/scripts/compile_dataset.py
@@ -6,8 +6,7 @@ import random
 import os
 import re
 import pdb
-#import pydb
-import io_pickle
+import cPickle
 
 import numpy
 from numpy.matlib import mat,zeros,ones,inf
@@ -115,20 +114,10 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    print 'found %d remapped reads' % size_rem
 
    # this stores the new dataset
-   Sequences = []
-   Acceptors = []
-   Donors = []
+   SeqInfo = []
    Exons = []
-   Ests = []
    OriginalEsts = []
    Qualities = []
-   UpCut = []
-   StartPos = []
-   SplitPositions = []
-   Ids = []
-   FReads = []
-   RemappedReads = []
-
    AlternativeSequences = []
 
    # Iterate over all remapped reads in order to generate for each read an
@@ -146,7 +135,6 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          try:
             gene_id = currentFRead['gene_id']
             chro = currentFRead['chr']
-            currentGene = allGenes[chro][gene_id]
          except:
             continue
 
@@ -154,28 +142,22 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
             #print 'wrong strand'
             continue
 
-         seq, acc, don, exons, est, qual, up_cut, start_pos =\
-            process_filtered_read(currentFRead,currentGene,dna_flat_files,test)
+         seq_info, exons =\
+            process_filtered_read(currentFRead,dna_flat_files,test)
 
          original_est = currentFRead['seq']
 
-         #alternative_seq = process_remapped_reads(reReads,currentFRead,dna_flat_files)
          alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
 
-         #check_remapped_reads(reReads,currentFRead,dna_flat_files)
-
-         if seq == '':
-            continue
-
-         Sequences.append(seq)
-         Acceptors.append(acc)
-         Donors.append(don)
+         SeqInfo.append(seq_info)
          Exons.append(exons)
-         Ests.append(est)
          OriginalEsts.append(original_est)
-         Qualities.append(qual)
-         UpCut.append(up_cut)
-         StartPos.append(start_pos)
+
+         quality  = currentFRead['prb']
+         cal_prb  = currentFRead['cal_prb']
+         chastity = currentFRead['chastity']
+
+         Qualities.append( (quality,cal_prb,chastity) )
          AlternativeSequences.append(alternative_seq)
 
          instance_counter += 1
@@ -186,274 +168,54 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          if instance_counter == 20000:
             break
 
-   print 'Full dataset has size %d' % len(Sequences)
-
-   dataset = [Sequences, Acceptors, Donors,\
-   Exons, Ests, OriginalEsts, Qualities,\
-   UpCut, StartPos, AlternativeSequences]
+   print 'Full dataset has size %d' % len(SeqInfo)
 
-   #dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
-   #'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
-   #'AlternativeSequences':AlternativeSequences,\
-   #'AlternativeAcceptors':AlternativeAcceptors,\
-   #'AlternativeDonors':AlternativeDonors}
-
-   #'Ids':Ids, 'FReads':FReads}
+   dataset = [SeqInfo, Exons, OriginalEsts, Qualities,\
+   AlternativeSequences]
 
    # saving new dataset
    #io_pickle.save(dataset_file,dataset)
 
-   #io_pickle.save(dataset_file,dataset)
    cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
 
 
-def process_filtered_read(fRead,currentGene,dna_flat_files,test):
+def process_filtered_read(fRead,dna_flat_files,test):
    """
    Creates a training example
 
    """
 
-   quality        = fRead['prb']
-   #quality        = fRead['cal_prb']
-   #quality        = fRead['chastity']
-
-   spos           = fRead['splitpos']
    chr            = fRead['chr']
    strand         = fRead['strand']
 
-   # use matlab-style functions to access dna sequence. we fetch a slice that
-   # is a bit bigger than the genome in case the read starts at the very beginning
-   chrom         = 'chr%d' %  fRead['chr']
-   genomicSeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   # add a leading 'a' for index reasons
-   genomicSeq = 'a' + genomicSeq
-
-   # check the obtained dna sequence
-   assert genomicSeq != '', 'load_genomic returned empty sequence!'
-   for elem in genomicSeq:
-      assert elem in alphabet, pdb.set_trace()
-
-   if strand == '-':
-      try:
-         genomicSeq = reverse_complement(genomicSeq)
-      except:
-         print 'invalid char'
-         return process_nil
-
-   # fetch the dna fragment that represents the part of the two exons
-   dnaFragment,currentExons,up_cut,down_cut, currentReadSeq = getDNAFragment(currentGene,genomicSeq,fRead)
-
-   if dnaFragment == '':
-      return process_nil
-
-   intervalBegin  = currentGene.start+up_cut-100
-   intervalEnd    = currentGene.start+down_cut+100
-
-   seq_pos_offset = currentGene.start+up_cut
-
-   don, acc       = getSpliceScores(chr,strand,intervalBegin,intervalEnd,dnaFragment,seq_pos_offset )
+   genomic_start  = fRead['p_start']
+   genomic_stop   = fRead['p_stop']
 
-   return dnaFragment, acc, don, currentExons, currentReadSeq, quality, up_cut,\
-   currentGene.start+up_cut
+   CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
 
-
-def getDNAFragment(currentGene,currentDNASeq,fRead):
-   """
-   This function calculates the dna fragment that corresponds to the two exon
-   parts of the read
-
-   """
-   #print 'entering getDNAFragment...'
-   
-   exon_stop   = fRead['exon_stop']
-   exon_start  = fRead['exon_start']
-
-   currentReadSeq = fRead['seq']
-
-   # assertions to check whether exon start/stop positions are next to donor
-   # resp. acceptor sites
-   exonEnd     = exon_stop - currentGene.start
-   exonBegin   = exon_start - currentGene.start
-
-   current_don = currentDNASeq[exonEnd+1:exonEnd+3]
-   current_acc = currentDNASeq[exonBegin-2:exonBegin]
-
-   if not (current_don == 'gt' or current_don == 'gc'):
-      print 'fragment not aligned'
-      #pydb.debugger()
-      return nil_dna_frag
-
-   if not current_acc == 'ag':
-      print 'fragment not aligned'
-      #pydb.debugger()
-      return nil_dna_frag
-
-   #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
-   #assert current_acc == 'ag', pdb.set_trace()
-   # assertions for don/acc sites held now we recalculate offset
-   #assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
-
-   # now we check whether the read sequence contains indels. if so we have to recalculate the boundaries
-   # if not we use p_start and p_stop directly
-   seq_has_indels = False
-   for elem in currentReadSeq:
-      #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
-      if not elem in extended_alphabet:
-         print 'incorrect alphabet'
-         return nil_dna_frag
-
-      if elem ==  ']':
-         seq_has_indels = True
-
-   if not seq_has_indels:
-      p_start     = fRead['p_start']
-      p_stop      = fRead['p_stop']
+   if genomic_start <= CUT_OFFSET:
+      up_cut   = genomic_start
    else:
-      spos = fRead['splitpos']
-      # if we have indels we have to take care of the exons boundaries
-      firstReadSeq   = currentReadSeq[:spos]
-      secondReadSeq  = currentReadSeq[spos:]
-
-      est_fragment = []
-      r_idx = 0
-      while True:
-         if not r_idx < len(currentReadSeq):
-            break
-
-         if currentReadSeq[r_idx] == '[':
-            est_fragment.append(currentReadSeq[r_idx+2])
-            r_idx += 4
-            continue
-
-         est_fragment.append(currentReadSeq[r_idx])
-         r_idx += 1
-
-      dna_fragment_1 = []
-      r_idx = 0
-      while True:
-         if not r_idx < len(firstReadSeq):
-            break
-
-         if firstReadSeq[r_idx] == '[':
-            dna_fragment_1.append(firstReadSeq[r_idx+1])
-            r_idx += 4
-            continue
-
-         dna_fragment_1.append(firstReadSeq[r_idx])
-         r_idx += 1
-
-      dna_fragment_2 = []
-      r_idx = 0
-      while True:
-         if not r_idx < len(secondReadSeq):
-            break
-
-         if secondReadSeq[r_idx] == '[':
-            dna_fragment_2.append(secondReadSeq[r_idx+1])
-            r_idx += 4
-            continue
-
-         dna_fragment_2.append(secondReadSeq[r_idx])
-         r_idx += 1
-
-      currentReadSeq = "".join(est_fragment)
-      dna_fragment_1 = "".join(dna_fragment_1)
-      dna_fragment_2 = "".join(dna_fragment_2)
-
-      dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
-      dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
+      up_cut   = genomic_start - CUT_OFFSET
 
-      # new check the fragment of the annotation
-      new_exon1_size = dna_fragment_1_size
-      p_start        = exon_stop - new_exon1_size + 1
+   #if genomic_stop + CUT_OFFSET >  :
+   #   down_cut = currentGene.stop - currentExons[1,1]
+   #else:
+   down_cut = genomic_stop + CUT_OFFSET
 
-      dna_annot_1    = currentDNASeq[p_start-currentGene.start:exon_stop+1-currentGene.start]
-
-      # new check the fragment of the annotation
-      new_exon2_size = dna_fragment_2_size 
-      p_stop         = exon_start + new_exon2_size - 1
-
-      dna_annot_2    = currentDNASeq[exon_start-currentGene.start:p_stop+1-currentGene.start]
-
-      #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
-      #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
-
-      if not dna_fragment_1.replace('-','') == dna_annot_1:
-         print 'genomic seq mismatch'
-         #pydb.debugger()
-         print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
-         print 'dna_fragment_1:\t %s' % dna_fragment_1
-         print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
-         return nil_dna_frag
-
-      if not dna_fragment_2.replace('-','') == dna_annot_2:
-         print 'genomic seq mismatch'
-         #pydb.debugger()
-         print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
-         print 'dna_fragment_2:\t %s' % dna_fragment_2
-         print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
-         return nil_dna_frag
-
-      for elem in currentReadSeq:
-         assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
-
-   assert p_stop - p_start >= Conf.read_size
+   seq_info = (chr,strand,up_cut,down_cut)
 
    # now that we have p_start and p_stop properly 
    # we can define the correct exons borders
    currentExons = zeros((2,2),dtype=numpy.int)
 
-   currentExons[0,0] = p_start
-   currentExons[0,1] = exon_stop
-
-   currentExons[1,0] = exon_start
-   currentExons[1,1] = p_stop
-
-   currentExons -= currentGene.start
-
-   # determine cutting positions for sequences
-   # we cut out a slightly bigger region as we want to learn the read start and
-   # stop positions as well
-   ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
-
-   if currentExons[0,0] <= ADD_POS:
-      up_cut   = currentExons[0,0]
-   else:
-      up_cut   = currentExons[0,0] - ADD_POS
-
-   if currentGene.stop - currentExons[1,1] <= ADD_POS:
-      down_cut = currentGene.stop - currentExons[1,1]
-   else:
-      down_cut = currentExons[1,1] + ADD_POS
+   currentExons[0,0] = fRead['p_start']
+   currentExons[0,1] = fRead['exon_stop']
 
-   # make exon positions start relative to dna sequence start
-   currentExons -= up_cut
+   currentExons[1,0] = fRead['exon_start']
+   currentExons[1,1] = fRead['p_stop']
 
-   # cut out part of the sequence
-   currentDNASeq = currentDNASeq[up_cut:down_cut]
-
-   # ensure that index is correct
-   currentExons[0,1] += 1
-   currentExons[1,1] += 1
-
-   try:
-      if not (currentDNASeq[int(currentExons[0,1])] == 'g' and currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
-         print 'not aligned'
-         return nil_dna_frag
-
-      if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
-         print 'not aligned'
-         return nil_dna_frag
-   except:
-      print 'error with exon boundaries'
-      return nil_dna_frag
-
-   #print 'leaving getDNAFragment...'
-
-   # end of exons/read borders calculation
-   return currentDNASeq, currentExons, up_cut, down_cut, currentReadSeq
+   return seq_info, currentExons
 
 
 def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset):
@@ -616,51 +378,6 @@ def process_map(reReads,fRead,dna_flat_files):
    return alternativeSeq
 
 
-def process_remapped_reads(reReads,fRead,dna_flat_files):
-   """
-   For all matches found by Vmatch we calculate the fragment of the DNA we
-   would like to perform an alignment during prediction.
-   """
-
-   fragment_size = 3000
-
-   seq = fRead['seq']
-   strand = fRead['strand']
-   pos = fRead['p_start']
-   chr = fRead['chr']
-
-   alternativeSeq = []
-
-   for currentRRead in reReads:
-      rId            = currentRRead['id']
-
-      pos1           = currentRRead['pos1']
-      chr1           = currentRRead['chr1']
-      seq1           = currentRRead['seq1']
-
-      s1_pos = seq.find(seq1)
-
-      seq2           = currentRRead['seq2']
-
-      if not seq2 == '':
-         pos2           = currentRRead['pos2']
-         chr2           = currentRRead['chr2']
-         s2_pos = seq.find(seq2)
-   
-      # vmatch found a correct position
-      if pos + s1_pos == pos1:
-         genomicSeq_start  = pos1
-         genomicSeq_stop   = pos1 + fragment_size
-         currentLabel = True
-      else:
-         genomicSeq_start  = pos1
-         genomicSeq_stop   = pos1 + fragment_size
-         currentLabel = False
-         
-      alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
-
-   return alternativeSeq
-
 def check_remapped_reads(reReads,fRead,dna_flat_files):
    """
    For all matches found by Vmatch we calculate the fragment of the DNA we
@@ -732,6 +449,42 @@ def reverse_complement(seq):
    return new_seq
 
 
+def unbracket_est(est):
+   new_est = ''
+   e = 0
+   while True:
+      if not e < len(est):
+         break
+
+      if est[e] == '[':
+         new_est += est[e+2]
+         e += 4
+      else:
+         new_est += est[e]
+         e += 1
+
+   return "".join(new_est).lower()
+
+
+def get_seq(begin,end,exon_end):
+
+   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+
+   if exon_end:
+      gene_start = begin
+      gene_stop  = end+2
+   else:
+      gene_start = begin-2
+      gene_stop  = end
+
+   chrom         = 'chr%d' % 1
+   strand        = '+'
+
+   genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
+   genomicSeq = genomicSeq.lower()
+
+   return genomicSeq
+
 if __name__ == '__main__':
    if len(sys.argv) == 1:
       print info
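
The unbracket_est() helper added above (imported by check_dataset.py and called from qpalma_main.py) strips the bracket notation from read sequences: inside each four-character '[..]' group it keeps the character at offset 2 and lowercases the result. For example (derived from the code, not an official doctest):

   >>> unbracket_est('AA[TC]GG')
   'aacgg'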
diff --git a/scripts/qpalma_main.py b/scripts/qpalma_main.py
index e52984e..5eee532 100644
--- a/scripts/qpalma_main.py
+++ b/scripts/qpalma_main.py
@@ -630,7 +630,6 @@ class QPalma:
       Donors      = self.Donors[beg:end]
       UpCut       = self.UpCut[beg:end]
       StartPos    = self.StartPos[beg:end]
-      #SplitPos    = self.SplitPositions[beg:end]
 
       AlternativeSequences = self.AlternativeSequences[beg:end]
 
@@ -678,22 +677,8 @@ class QPalma:
          dna = Sequences[exampleIdx] 
          est = Ests[exampleIdx] 
 
-         new_est = ''
-         e = 0
-         while True:
-            if not e < len(est):
-               break
+         new_est = unbracket_est(est)
 
-            if est[e] == '[':
-               new_est += est[e+2]
-               e += 4
-            else:
-               new_est += est[e]
-               e += 1
-
-         est = new_est
-         est = "".join(est)
-         est = est.lower()
 
          exons = Exons[exampleIdx]