+ modified filtering of reads
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 4 Feb 2008 10:11:34 +0000 (10:11 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Mon, 4 Feb 2008 10:11:34 +0000 (10:11 +0000)
+ extended dataset generation script
+ rearranged loading functions for qpalma training

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7678 e1793c9e-67f9-0310-80fc-b846ff1f7b36

qpalma/Configuration.py
qpalma/DataProc.py
qpalma/parsers.py
scripts/compile_dataset.py
scripts/qpalma_main.py
tools/data_tools/filterReads.c

index e613511..a4c2876 100644 (file)
@@ -436,6 +436,9 @@ annot_path     = joinp(data_path,'annotation_data')
 original_path  = joinp(data_path,'original_solexa_data')
 
 joinp(annot_path,'TAIR7_GFF3_genes_Chr1.gff_v1')
+
+
+data_filename = ''
 ###############################################################################
 #
 # SANITY CHECKS
index 5d02c6b..8db88d3 100644 (file)
@@ -13,6 +13,20 @@ import sys
 
 from genome_utils import load_genomic
 
+def paths_load_data(filename,expt,genome_info,PAR,test=False):
+
+   data = io_pickle.load(filename)
+
+   Sequences   = data['Sequences']
+   Acceptors   = data['Acceptors']
+   Donors      = data['Donors']
+   Exons       = data['Exons']
+   Ests        = data['Ests']
+   Qualities   = data['Qualities']
+   SplitPositions = data['SplitPositions']
+
+   return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
+
 def paths_load_data_solexa(expt,genome_info,PAR,test=False):
    """
 
@@ -254,63 +268,3 @@ def paths_load_data_pickle(expt,genome_info,PAR):
    Exons -= 1
 
    return Sequences, Acceptors, Donors, Exons, Ests, Noises
-
-def paths_load_data(expt,genome_info,PAR):
-   """
-      
-   """
-
-   # function [Sequences, Acceptors, Donors, Exons, Ests, Noises] = paths_load_data(expt,genome_info,PAR)
-   # Load the relevant file and return the alignment data
-
-   # expt can be 'training','validation' or 'test'
-
-   assert expt in ['training','validation','test']
-
-   tmp_dir = '/fml/ag-raetsch/home/fabio/tmp'
-
-   Noises = [];
-
-   if expt == 'training':
-      if PAR.microexon:
-         if PAR.LOCAL_ALIGN: # local version
-
-            train_data = '%s/microexon_train_data_cut_local.mat' % genome_info.basedir
-            train_data = '%s/microexon_train_data.mat' % genome_info.basedir
-            #train_data_pickle = '%s/microexon_train_data_cut_local.pickle'% tmp_dir
-            #io_pickle.convert_v6(train_data,train_data_pickle)
-            #train_data = io_pickle.load(train_data_pickle)
-            data = scipy.io.loadmat(train_data)
-
-         else: # global version
-
-            train_data = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.mat' %\
-               (genome_info.basedir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
-
-            train_data = '%s/microexon_train_data.mat' % genome_info.basedir
-            #train_data_pickle = '%s/microexon_train_data_cut_ip=%1.3f_dp=%1.3f_mp=%1.3f.pickle' %\
-            #   (tmp_dir, PAR.insertion_prob, PAR.deletion_prob, PAR.mutation_prob)
-
-            #io_pickle.convert_v6(train_data,train_data_pickle)
-            #train_data = io_pickle.load(train_data_pickle)
-            data = scipy.io.loadmat(train_data)
-            Noises = data['TrainNoise'] # substitution matrix
-
-      else:
-         train_data = '%s/exons_train_local.mat' % genome_info.basedir
-         #train_data_pickle = '%s/exons_train_local.pickle'% tmp_dir
-         #io_pickle.convert_v6(train_data,train_data_pickle)
-         #microexon_train_data = io_pickle.load(train_data_pickle)
-         data = scipy.io.loadmat(train_data)
-
-      #print 'train_data is %s' % train_data
-
-      Sequences = data['Train']       # dna sequences
-      Acceptors = data['TrainAcc']    # acceptor scores
-      Donors    = data['TrainDon']    # donor scores
-      Exons     = data['TrainExon']   # exon boundaries
-      Ests      = data['TrainEsts']   # est sequences
-     
-   Exons -= 1
-
-   return Sequences, Acceptors, Donors, Exons, Ests, Noises
index 072debd..81ec5b7 100644 (file)
@@ -15,6 +15,13 @@ class ReadParser:
    def next(self):
       pass
 
+   def parse(self):
+      lines = []
+
+      for elem in self:
+         lines.append(elem)
+         
+      return lines
 
 class FilteredReadParser(ReadParser):
    """
@@ -27,15 +34,33 @@ class FilteredReadParser(ReadParser):
 
    def parseLine(self,line):
       """
-      We assume that a line has the following entires:
+      We assume that a line has the following entries:
+      
+      read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
 
       """
-      return id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+      id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+      splitpos = int(splitpos)
+      read_size = int(read_size)
+
+      prb = [ord(elem)-50 for elem in prb]
+      cal = [ord(elem)-64 for elem in cal]
+      chastity = [ord(elem)+10 for elem in chastity]
+
+      p_start = int(p_start)
+      exons_stop = int(exon_stop)
+      exon_start = int(exon_start)
+      p_stop = int(p_stop)
+
+      line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+      'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+      'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start, 'p_stop':p_stop}
+      return line_d
 
    def next(self):
       for line in self.fh:
          line = line.strip()
-          yield self.parseLine
+          yield self.parseLine(line)
 
       raise StopIteration
 
@@ -44,6 +69,11 @@ class RemappedReadParser(ReadParser):
    """
    This class offers a parser for the reads that are remapped by the vmatch
    utility. 
+
+   According to the docu the entries are:
+   
+   ID, Chromosome, Position, Orientation (D or P), Mismatches, Alignment length, Offset, Alignment
+
    """
 
    def __init__(self,filename):
@@ -54,11 +84,19 @@ class RemappedReadParser(ReadParser):
       We assume that a line has the following entires:
 
       """
-      return id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+      pos = int(pos)
+      mismatches = int(mismatches)
+      align_len = int(align_len)
+      offset = int(offset)
+      line_d = {'id':id, 'chr':chr, 'pos':pos, 'strand':strand, 'mismatches':mismatches, 'align_len':align_len,\
+      'offset':offset, 'alignment':alignment}
+      return line_d
 
    def next(self):
       for line in self.fh:
          line = line.strip()
-          yield self.parseLine
+          yield self.parseLine(line)
 
       raise StopIteration
+
index 88596e8..51c5d9d 100644 (file)
@@ -45,7 +45,7 @@ info = """
    - 'remappped_pos' - the position of the remapped read
    """
 
-def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,dataset_file):
+def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file):
 
    assert os.path.exists(gff_file)
    for file in dna_flat_files:
@@ -64,6 +64,13 @@ def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,datase
    dna_filename = Conf.dna_filename
    est_filename = Conf.est_filename
 
+   frp = FilteredReadParser(filtered_reads)
+   all_filtered_reads = frp.parse()
+
+   rrp = RemappedReadParser(remapped_reads)
+   all_remapped_reads = rrp.parse()
+
+   # this stores the new dataset
    Sequences = []
    Acceptors = []
    Donors = []
@@ -71,199 +78,182 @@ def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,datase
    Ests = []
    Qualities = []
    SplitPositions = []
+   
+   # Iterate over all remapped reads in order to generate for each read an
+   # training / prediction example
+   for reRead in all_remapped_reads:
+      currentId = reRead['id']
+      fRead = all_filtered_reads[currentId]
+      currentGene = allGenes[gene_id]
 
-   frp = FilteredReadParser('')
-   rrp = RemappedReadParser('est_filename')
+      chrom             = 'chr1'
+      genome_file       = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+      sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
 
+      seq, acc, don, exons, est, qual, spos = process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename)
+      
+      if newExample == None:
+         continue
 
-   # Iterate over all reads
-   for line in open(est_filename):
-      line = line.strip()
-      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+      Sequences.append(seq)
+      Acceptors.append(acc)
+      Donors.append(don)
+      Exons.append(exons)
+      Ests.append(est)
+      Qualities.append(qual)
+      SplitPositions.append(spos)
 
-      splitpos = int(splitpos)
-      length = int(length)
-      prb = [ord(elem)-50 for elem in prb]
-      cal = [ord(elem)-64 for elem in cal]
-      chastity = [ord(elem)+10 for elem in chastity]
 
-      assert len(prb) == len(seq)
+   dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors\
+   'Exons':Exons, 'Ests':Ests, 'Qualities':Qualities, 'SplitPositions':SplitPositions}
+   # saving new dataset
+   io_pickle.save(dataset_file,dataset)
 
-      currentGene = allGenes[gene_id]
-      seq = seq.lower()
 
-      #try:
-      #   currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
-      #except:
-      #   continue
+def process_read(reRead,fRead,currentGene,chrom,genome_file,sscore_filename):
+   """
+   The purpose of this function is to create an example for QPalma 
+   by using a 
 
-      genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      chrom    = 'chr1'
+   """
 
-      # use matlab-style functions to access dna sequence
-      dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
-      currentSeq = dna
+   # use matlab-style functions to access dna sequence
+   dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
+   dna = dna.lower()
+   currentSeq = dna
 
-      if len(currentSeq) < (currentGene.stop - currentGene.start):
-         continue
+   if len(currentSeq) < (currentGene.stop - currentGene.start):
+      return None
 
-      # compare both sequences
-      #assert dna == currentSeq, pdb.set_trace()
-   
-      p_start = int(p_start)
-      exon_stop = int(exon_stop)
-      exon_start = int(exon_start)
-      p_stop = int(p_stop)
-
-      assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
-      assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
-      assert p_stop - p_start >= 36
-   
-      currentExons = zeros((2,2))
+   p_start = fRead['p_start']
+   exon_stop = fRead['exon_stop']
+   exon_start = fRead['exon_start']
+   p_stop = fRead['p_stop']
 
-      currentExons[0,0] = p_start
-      currentExons[0,1] = exon_stop
+   currentSeq = fRead['seq']
 
-      currentExons[1,0] = exon_start
-      currentExons[1,1] = p_stop
+   assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
+   assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
+   assert p_stop - p_start >= 36
 
-      # make indices relative to start pos of the current gene
-      currentExons -= currentGene.start
+   currentExons = zeros((2,2))
 
-      # determine cutting positions for sequences
-      up_cut   = int(currentExons[0,0])
-      down_cut = int(currentExons[1,1])
+   currentExons[0,0] = p_start
+   currentExons[0,1] = exon_stop
 
-      # if we perform testing we cut a much wider region as we want to detect
-      # how good we perform on a region
-      if test:
-         up_cut = up_cut-500
-         if up_cut < 0:
-            up_cut = 0
+   currentExons[1,0] = exon_start
+   currentExons[1,1] = p_stop
 
-         down_cut = down_cut+500
+   # make indices relative to start pos of the current gene
+   currentExons -= currentGene.start
 
-         if down_cut > len(currentSeq):
-            down_cut = len(currentSeq)
+   # determine cutting positions for sequences
+   up_cut   = int(currentExons[0,0])
+   down_cut = int(currentExons[1,1])
 
-      # cut out part of the sequence
-      currentSeq = currentSeq[up_cut:down_cut]
+   # if we perform testing we cut a much wider region as we want to detect
+   # how good we perform on a region
+   if test:
+      up_cut = up_cut-500
+      if up_cut < 0:
+         up_cut = 0
 
-      # ensure that index is correct
-      currentExons[0,1] += 1
+      down_cut = down_cut+500
 
-      if test:
-         currentExons -= up_cut
-      else:
-         currentExons -= currentExons[0,0]
+      if down_cut > len(currentSeq):
+         down_cut = len(currentSeq)
 
-      try:
-         if not (currentSeq[int(currentExons[0,1])] == 'g' and\
-         currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
-            continue
+   # cut out part of the sequence
+   currentSeq = currentSeq[up_cut:down_cut]
 
-         if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
-            continue
+   # ensure that index is correct
+   currentExons[0,1] += 1
 
-      except:
-         pdb.set_trace()
+   if test:
+      currentExons -= up_cut
+   else:
+      currentExons -= currentExons[0,0]
 
-      Sequences.append(currentSeq)
+   try:
+      if not (currentSeq[int(currentExons[0,1])] == 'g' and\
+      currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
+         continue
 
-      # now we want to use interval_query to get the predicted splice scores
-      # trained on the TAIR sequence and annotation
+      if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+         continue
 
-      from Genefinding import *
-      
-      interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
-      num_fields = 1
-      num_intervals = 1
-
-      # fetch acceptor scores
-      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+' 
-      gf = Genef()
-      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
-      assert num_hits <= len(currentSeq)
-      
-      #print 'Acceptor hits: %d' % num_hits
+   except:
+      pdb.set_trace()
 
-      pos = createIntArrayFromList([0]*num_hits)
-      indices = createIntArrayFromList([0]*num_hits)
-      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
-      gf.getResults(pos,indices,scores)
+   # now we want to use interval_query to get the predicted splice scores
+   # trained on the TAIR sequence and annotation
+   from Genefinding import *
+   
+   interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
+   num_fields = 1
+   num_intervals = 1
+
+   # fetch acceptor scores
+   gf = Genef()
+   num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
+   assert num_hits <= len(currentSeq)
+   
+   #print 'Acceptor hits: %d' % num_hits
 
-      currentAcceptors  = [-inf]*(len(currentSeq)+1)
-      
-      for i in range(num_hits):
+   pos = createIntArrayFromList([0]*num_hits)
+   indices = createIntArrayFromList([0]*num_hits)
+   scores = createDoubleArrayFromList([0]*num_hits*num_fields)
+   gf.getResults(pos,indices,scores)
+
+   acc  = [-inf]*(len(currentSeq)+1)
+   
+   for i in range(num_hits):
+      position = pos[i] 
+      position -= (currentGene.start+up_cut)
+      assert 0 <= position < len(currentSeq)+1, 'position index wrong'
+      acc[position] = scores[i]
+
+   acc = acc[1:] + [-inf]
+   Acceptors.append(acc)
+
+   del gf
+
+   # fetch donor scores
+   gf = Genef()
+   num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
+   assert num_hits <= len(currentSeq)
+
+   #print 'Donor hits: %d' % num_hits
+   pos = createIntArrayFromList([0]*num_hits)
+   indices = createIntArrayFromList([0]*num_hits)
+   scores = createDoubleArrayFromList([0]*num_hits*num_fields)
+   gf.getResults(pos,indices,scores)
+
+   don     = [-inf]*(len(currentSeq))
+
+   try:
+      for i in range(1,num_hits):
          position = pos[i] 
          position -= (currentGene.start+up_cut)
-         assert 0 <= position < len(currentSeq)+1, 'position index wrong'
-         currentAcceptors[position] = scores[i]
-
-      currentAcceptors = currentAcceptors[1:] + [-inf]
-      Acceptors.append(currentAcceptors)
-
-      del gf
-
-      # fetch donor scores
-      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
-      gf = Genef()
-      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
-      assert num_hits <= len(currentSeq)
-
-      #print 'Donor hits: %d' % num_hits
-      pos = createIntArrayFromList([0]*num_hits)
-      indices = createIntArrayFromList([0]*num_hits)
-      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
-      gf.getResults(pos,indices,scores)
-
-      currentDonors     = [-inf]*(len(currentSeq))
-
-      try:
-         for i in range(1,num_hits):
-            position = pos[i] 
-            position -= (currentGene.start+up_cut)
-            currentDonors[position-1] = scores[i]
-      except:
-         pdb.set_trace()
-
-      currentDonors = [-inf] + currentDonors[:-1]
-      Donors.append(currentDonors)
-
-      Exons.append(currentExons)
-      Ests.append(seq)
-      Qualities.append(prb)
-
-      SplitPositions.append(splitpos)
-
-      #if len(Sequences[-1]) == 2755:
-      #   Sequences = Sequences[:-1]
-      #   Acceptors = Acceptors[:-1]
-      #   Donors    = Donors[:-1]
-      #   Exons     = Exons[:-1]
-      #   Ests      = Ests[:-1]
-      #   Qualities = Qualities[:-1]
-      #   SplitPositions = SplitPositions[:-1]
-   
-   #print 'found %d examples' % len(Sequences)
+         don[position-1] = scores[i]
+   except:
+      pdb.set_trace()
 
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
+   don = [-inf] + currentDonors[:-1]
 
+   return seq, acc, don, currentExons, est, qual, spos
 
-      
 
-   dataset = []
+def reverse_complement(seq):
+   map {'a':'t','c':'g','g':'c','t':'a'}
 
-   {'id':'', 'chr':'', '', 'strand':,  = line.split()
+   new_seq = [map[elem] for elem in seq]
+   new_seq.reverse()
 
+   return new_seq
    
-   io_pickle.save(dataset_file,dataset)
-
 
 if __name__ == '__main__':
-
    if len(sys.argv) == 1:
       print info
 
index ed94ab1..f5438df 100644 (file)
@@ -65,8 +65,10 @@ class QPalma:
 
       elif Conf.mode == 'using_quality_scores':
 
-         filename = 'real_dset_%s'% 'recent'
-         Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+         #Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data_solexa('training',None,self.ARGS)
+         data_filename = Conf.data_filename
+
+         Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPos = paths_load_data(data_filename,'training',None,self.ARGS)
 
          #end = 1000
          self.Sequences   = Sequences
@@ -74,8 +76,8 @@ class QPalma:
          self.Ests        = Ests
          self.Qualities   = Qualities
          self.SplitPos    = SplitPos
-
-         self.Donors, self.Acceptors = getDonAccScores(Sequences)
+         self.Donors      = Donors
+         self.Acceptors   = Acceptors
 
          #self.Acceptors   = self.Acceptors[:end]
          #self.Donors      = self.Donors[:end]
index 55730cf..ece462a 100644 (file)
@@ -46,6 +46,7 @@ const int read_size = 36;
 const int min_overlap = 6;
 const int max_overlap = 30;
 
+int read_nr = 1;
 
 int combined_reads = 0;
 int main(int argc, char* argv[]) {
@@ -460,8 +461,9 @@ void combine_info(int exon_stop, int exon_start, void** upstream, int up_size, v
          // exons_stop  : the position in the dna where the first exons ends
          // exons_start : the position in the dna where the second exons starts
          // p_stop  : the position in the dna where the (truncated) second read ends
-         fprintf(out_fs,"%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
-         new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
+         fprintf(out_fs,"%08d\t%d\t%c\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\n",
+         read_nr,new_chr,new_strand,new_seq,splitpos,read_size,new_prb,new_cal_prb,new_chastity,gene_id,p_start,exon_stop,exon_start,p_stop);
+         read_nr++;
 
          combined_reads++;
          used_flag[down_idx] = 1;