+ added basic scripts to generate runs for an experiment
[qpalma.git] / scripts / compile_dataset.py
index 88596e8..cf79622 100644 (file)
@@ -2,14 +2,33 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import os
 import pdb
 import io_pickle
 
+import numpy
+from numpy.matlib import mat,zeros,ones,inf
+
+import qpalma
 import qpalma.tools
 from qpalma.tools.parseGff import parse_gff
 
 from qpalma.parsers import FilteredReadParser, RemappedReadParser
 
+from Genefinding import *
+
+from genome_utils import load_genomic
+
+import qpalma.Configuration as Conf
+
+warning = """
+   #########################################################
+   WARNING ! Disabled the use of non-bracket reads WARNING !
+   #########################################################
+   
+   """
+
+
 help = """
 
    Usage of this script is:
@@ -45,13 +64,9 @@ info = """
    - 'remappped_pos' - the position of the remapped read
    """
 
-def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,dataset_file):
-
-   assert os.path.exists(gff_file)
-   for file in dna_flat_files:
-      assert os.path.exists(file)
+def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
 
-   assert os.path.exists(solexa_reads)
+   assert os.path.exists(filtered_reads)
    assert os.path.exists(remapped_reads)
 
    assert not os.path.exists(dataset_file), 'The data_file already exists!'
@@ -59,214 +74,429 @@ def compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,tmp_dir,datase
    joinp = os.path.join
 
    # first read the gff file(s) and create pickled gene information
-   allGenes = parse_gff(gff_file,joinp(tmp_dir,'gff_info.pickle'))
-   
-   dna_filename = Conf.dna_filename
-   est_filename = Conf.est_filename
+   allGenes = [None]*6
+
+   for i in range(1,6):
+      allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
+
+   print 'parsing filtered reads..'
+   frp = FilteredReadParser(filtered_reads)
+   all_filtered_reads = frp.parse()
+   print 'found %d filtered reads' % len(all_filtered_reads)
+
+   print 'parsing remapped reads'
+   rrp = RemappedReadParser(remapped_reads)
+   all_remapped_reads = rrp.parse()
+   if all_remapped_reads != None:
+      size_rem = len(all_remapped_reads)
+   else:
+      size_rem = 0
 
+   print 'found %d remapped reads' % size_rem
+
+   # this stores the new dataset
    Sequences = []
    Acceptors = []
    Donors = []
    Exons = []
    Ests = []
+   OriginalEsts = []
    Qualities = []
    SplitPositions = []
+   Ids = []
 
-   frp = FilteredReadParser('')
-   rrp = RemappedReadParser('est_filename')
+   #pdb.set_trace()
+   
+   # Iterate over all remapped reads in order to generate for each read an
+   # training / prediction example
+   instance_counter = 0
 
+   for id,currentFRead in all_filtered_reads.items():
 
-   # Iterate over all reads
-   for line in open(est_filename):
-      line = line.strip()
-      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-      #chr,strand,seq,splitpos,length,prb,cal,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-      id,chr,pos,strand,mismatches,align_len,offset,alignment = line.split()
+      try:
+         currentRReads = all_remapped_reads[id]
+      except:
+         currentRReads = None
 
-      splitpos = int(splitpos)
-      length = int(length)
-      prb = [ord(elem)-50 for elem in prb]
-      cal = [ord(elem)-64 for elem in cal]
-      chastity = [ord(elem)+10 for elem in chastity]
+      try:
+         gene_id = currentFRead['gene_id']
+         chro = currentFRead['chr']
+         currentGene = allGenes[chro][gene_id]
+      except:
+         pdb.set_trace()
 
-      assert len(prb) == len(seq)
+      if currentFRead['strand'] != '+':
+         #print 'wrong strand'
+         continue
 
-      currentGene = allGenes[gene_id]
-      seq = seq.lower()
+      seq, acc, don, exons, est, original_est, qual, spos = process_read(currentRReads,currentFRead,currentGene,dna_flat_files,test)
+      
+      if seq == '':
+         continue
 
-      #try:
-      #   currentSeq = tair7_seq[gene_id+'.1']['sequence'].lower()
-      #except:
-      #   continue
+      Sequences.append(seq)
+      Acceptors.append(acc)
+      Donors.append(don)
+      Exons.append(exons)
+      Ests.append(est)
+      OriginalEsts.append(original_est)
+      Qualities.append(qual)
+      SplitPositions.append(spos)
+      Ids.append(id)
 
-      genome_file = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-      chrom    = 'chr1'
+      instance_counter += 1
 
-      # use matlab-style functions to access dna sequence
-      dna = load_genomic(chrom,'+',currentGene.start,currentGene.stop,genome_file,one_based=True)
-      currentSeq = dna
+      if instance_counter % 100 == 0:
+         print 'processed %d examples' % instance_counter
 
-      if len(currentSeq) < (currentGene.stop - currentGene.start):
-         continue
+      if instance_counter == 20000:
+         break
 
-      # compare both sequences
-      #assert dna == currentSeq, pdb.set_trace()
-   
-      p_start = int(p_start)
-      exon_stop = int(exon_stop)
-      exon_start = int(exon_start)
-      p_stop = int(p_stop)
-
-      assert p_start < exon_stop < exon_start < p_stop, 'Invalid Indices'
-      assert exon_stop - p_start + p_stop - exon_start == 36, 'Invalid read size'
-      assert p_stop - p_start >= 36
-   
-      currentExons = zeros((2,2))
+   print 'Full dataset has size %d' % len(Sequences)
 
-      currentExons[0,0] = p_start
-      currentExons[0,1] = exon_stop
+   dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
+   'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
+   'SplitPositions':SplitPositions,'Ids':Ids}
 
-      currentExons[1,0] = exon_start
-      currentExons[1,1] = p_stop
+   # saving new dataset
+   io_pickle.save(dataset_file,dataset)
+
+
+def process_read(reReads,fRead,currentGene,dna_flat_files,test):
+   """
+   The purpose of this function is to create an example for QPalma 
+   by using a 
+
+   """
+   nil = ('','','','','','','','')
+   extended_alphabet = ['-','a','c','g','t','n','[',']']
+   alphabet = ['-','a','c','g','t','n']
+
+   chr            = fRead['chr']
+   strand         = fRead['strand']
+   quality        = fRead['prb']
+   #quality        = fRead['cal_prb']
+   #quality        = fRead['chastity']
+   spos           = fRead['splitpos']
+   currentReadSeq = fRead['seq']
+   currentReadSeq = currentReadSeq.lower()
 
-      # make indices relative to start pos of the current gene
-      currentExons -= currentGene.start
+   #if currentReadSeq.find('[') == -1:
+   #   print warning
+   #   return nil
 
-      # determine cutting positions for sequences
-      up_cut   = int(currentExons[0,0])
-      down_cut = int(currentExons[1,1])
+   originalReadSeq = fRead['seq']
 
-      # if we perform testing we cut a much wider region as we want to detect
-      # how good we perform on a region
-      if test:
-         up_cut = up_cut-500
-         if up_cut < 0:
-            up_cut = 0
+   chrom             = 'chr%d' % chr
+   sscore_filename   = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s' % (chr,strand)
 
-         down_cut = down_cut+500
+   # use matlab-style functions to access dna sequence
+   currentDNASeq = load_genomic(chrom,strand,currentGene.start,currentGene.stop,dna_flat_files,one_based=False)
+   currentDNASeq = currentDNASeq.lower()
 
-         if down_cut > len(currentSeq):
-            down_cut = len(currentSeq)
+   assert currentDNASeq != '', 'load_genomic returned empty sequence!'
 
-      # cut out part of the sequence
-      currentSeq = currentSeq[up_cut:down_cut]
+   seq_has_indels = False
+   for elem in currentReadSeq:
+      #assert elem in extended_alphabet, 'Invalid char (extended_alphabet) in read: \"%s\"' % elem
+      if not elem in extended_alphabet:
+         return nil
 
-      # ensure that index is correct
-      currentExons[0,1] += 1
+      if elem ==  ']':
+         seq_has_indels = True
 
-      if test:
-         currentExons -= up_cut
-      else:
-         currentExons -= currentExons[0,0]
+   for elem in currentDNASeq:
+      #assert elem in alphabet, 'Invalid char in dna: \"%s\"' % elem
+      if not elem in alphabet:
+         return nil
 
+   if strand == '-':
       try:
-         if not (currentSeq[int(currentExons[0,1])] == 'g' and\
-         currentSeq[int(currentExons[0,1])+1] in ['c','t' ]):
+         currentDNASeq = reverse_complement(currentDNASeq)
+      except:
+         print 'invalid char'
+         return nil
+
+   currentDNASeq = 'a' + currentDNASeq
+
+   exon_stop   = fRead['exon_stop']
+   exon_start  = fRead['exon_start']
+
+   # assertions to check whether exon start/stop positions are next to donor
+   # resp. acceptor sites
+   exonEnd     = exon_stop - currentGene.start
+   exonBegin   = exon_start - currentGene.start 
+
+   current_don = currentDNASeq[exonEnd+1:exonEnd+3]
+   current_acc = currentDNASeq[exonBegin-2:exonBegin]
+   if not current_don == 'gt' or current_don == 'gc':
+      return nil
+
+   if not current_acc == 'ag':
+      return nil
+
+   #assert current_don == 'gt' or current_don == 'gc', pdb.set_trace()
+   #assert current_acc == 'ag', pdb.set_trace()
+
+   # assertions for don/acc sites held now we recalculate offset
+   exon_stop   = exon_stop - currentGene.start
+   exon_start  = exon_start - currentGene.start 
+
+   if not seq_has_indels:
+      p_start     = fRead['p_start']
+      exon_stop   = fRead['exon_stop']
+      exon_start  = fRead['exon_start']
+      p_stop      = fRead['p_stop']
+
+      assert (exon_stop - p_start) + (p_stop - exon_start) + 2 == Conf.read_size, 'Invalid read size'
+
+   else:
+      # if we have indels we have to take care of the exons boundaries
+      firstReadSeq   = currentReadSeq[:spos]
+      secondReadSeq  = currentReadSeq[spos:]
+
+      est_fragment = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(currentReadSeq):
+            break
+
+         if currentReadSeq[r_idx] == '[':
+            est_fragment.append(currentReadSeq[r_idx+2])
+            r_idx += 4
             continue
 
-         if not (currentSeq[int(currentExons[1,0])-1] == 'g' and currentSeq[int(currentExons[1,0])-2] == 'a'):
+         est_fragment.append(currentReadSeq[r_idx])
+         r_idx += 1
+
+      dna_fragment_1 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(firstReadSeq):
+            break
+
+         if firstReadSeq[r_idx] == '[':
+            dna_fragment_1.append(firstReadSeq[r_idx+1])
+            r_idx += 4
             continue
 
-      except:
-         pdb.set_trace()
+         dna_fragment_1.append(firstReadSeq[r_idx])
+         r_idx += 1
 
-      Sequences.append(currentSeq)
+      dna_fragment_2 = []
+      r_idx = 0
+      while True:
+         if not r_idx < len(secondReadSeq):
+            break
 
-      # now we want to use interval_query to get the predicted splice scores
-      # trained on the TAIR sequence and annotation
+         if secondReadSeq[r_idx] == '[':
+            dna_fragment_2.append(secondReadSeq[r_idx+1])
+            r_idx += 4
+            continue
 
-      from Genefinding import *
-      
-      interval_matrix = createIntArrayFromList([currentGene.start+up_cut,currentGene.start+down_cut])
-      num_fields = 1
-      num_intervals = 1
-
-      # fetch acceptor scores
-      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_1+' 
-      gf = Genef()
-      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
-      assert num_hits <= len(currentSeq)
-      
-      #print 'Acceptor hits: %d' % num_hits
+         dna_fragment_2.append(secondReadSeq[r_idx])
+         r_idx += 1
 
-      pos = createIntArrayFromList([0]*num_hits)
-      indices = createIntArrayFromList([0]*num_hits)
-      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
-      gf.getResults(pos,indices,scores)
+      currentReadSeq = "".join(est_fragment)
+      dna_fragment_1 = "".join(dna_fragment_1)
+      dna_fragment_2 = "".join(dna_fragment_2)
 
-      currentAcceptors  = [-inf]*(len(currentSeq)+1)
-      
-      for i in range(num_hits):
-         position = pos[i] 
-         position -= (currentGene.start+up_cut)
-         assert 0 <= position < len(currentSeq)+1, 'position index wrong'
-         currentAcceptors[position] = scores[i]
+      dna_fragment_1_size = len([p for p in dna_fragment_1 if p != '-'])
+      dna_fragment_2_size = len([p for p in dna_fragment_2 if p != '-'])
 
-      currentAcceptors = currentAcceptors[1:] + [-inf]
-      Acceptors.append(currentAcceptors)
+      # new check the fragment of the annotation
+      new_exon1_size = dna_fragment_1_size
+      p_start     = exon_stop - new_exon1_size + 1
 
-      del gf
+      dna_annot_1 = currentDNASeq[p_start:exon_stop+1]
 
-      # fetch donor scores
-      fname =  '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_1+' 
-      gf = Genef()
-      num_hits = gf.query(fname, ['Conf'], num_fields, interval_matrix, num_intervals)
-      assert num_hits <= len(currentSeq)
+      # new check the fragment of the annotation
+      new_exon2_size = dna_fragment_2_size 
+      p_stop      = exon_start + new_exon2_size - 1
 
-      #print 'Donor hits: %d' % num_hits
-      pos = createIntArrayFromList([0]*num_hits)
-      indices = createIntArrayFromList([0]*num_hits)
-      scores = createDoubleArrayFromList([0]*num_hits*num_fields)
-      gf.getResults(pos,indices,scores)
+      dna_annot_2 = currentDNASeq[exon_start:p_stop+1]
 
-      currentDonors     = [-inf]*(len(currentSeq))
+      #assert dna_fragment_1.replace('-','') == dna_annot_1, pdb.set_trace()
+      #assert dna_fragment_2.replace('-','') == dna_annot_2, pdb.set_trace()
 
-      try:
-         for i in range(1,num_hits):
-            position = pos[i] 
-            position -= (currentGene.start+up_cut)
-            currentDonors[position-1] = scores[i]
-      except:
-         pdb.set_trace()
+      if not dna_fragment_1.replace('-','') == dna_annot_1:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_1:\t %s' % dna_fragment_1
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
 
-      currentDonors = [-inf] + currentDonors[:-1]
-      Donors.append(currentDonors)
+      if not dna_fragment_2.replace('-','') == dna_annot_2:
+         #print 'genomic seq mismatch'
+         #print 'orignal read:\t %s ' % originalReadSeq
+         #print '1st/2nd read:\t %s %s' % (firstReadSeq,secondReadSeq)
+         #print 'dna_fragment_2:\t %s' % dna_fragment_2
+         #print 'dna_annot 1+2:\t %s %s' % (dna_annot_1,dna_annot_2)
+         return nil
 
-      Exons.append(currentExons)
-      Ests.append(seq)
-      Qualities.append(prb)
+      #print 'successful'
 
-      SplitPositions.append(splitpos)
+      for elem in currentReadSeq:
+         assert elem in alphabet, 'Invalid char (alphabet) in read: \"%s\"' % elem
 
-      #if len(Sequences[-1]) == 2755:
-      #   Sequences = Sequences[:-1]
-      #   Acceptors = Acceptors[:-1]
-      #   Donors    = Donors[:-1]
-      #   Exons     = Exons[:-1]
-      #   Ests      = Ests[:-1]
-      #   Qualities = Qualities[:-1]
-      #   SplitPositions = SplitPositions[:-1]
-   
-   #print 'found %d examples' % len(Sequences)
+   #assert p_start < exon_stop < exon_start < p_stop, pdb.set_trace()
+   assert p_stop - p_start >= Conf.read_size
 
-   return Sequences, Acceptors, Donors, Exons, Ests, Qualities, SplitPositions
+   currentExons = zeros((2,2),dtype=numpy.int)
 
+   currentExons[0,0] = p_start
+   currentExons[0,1] = exon_stop
 
-      
+   currentExons[1,0] = exon_start
+   currentExons[1,1] = p_stop
+
+   # determine cutting positions for sequences
+   up_cut   = int(currentExons[0,0])
+   down_cut = int(currentExons[1,1])
+
+# if we perform testing we cut a much wider region as we want to detect
+   # how good we perform on a region
+   import random
+   ADD_POS = random.randint(Conf.extension[0],Conf.extension[1])
 
-   dataset = []
+   if test:
+      up_cut = up_cut-ADD_POS
+      if up_cut < 0:
+         up_cut = 0
 
-   {'id':'', 'chr':'', '', 'strand':,  = line.split()
+      down_cut = down_cut+ADD_POS
 
+      if down_cut > len(currentDNASeq):
+         down_cut = len(currentDNASeq)
+
+   # cut out part of the sequence
+   currentDNASeq = currentDNASeq[up_cut:down_cut]
+
+   # ensure that index is correct
+   currentExons[0,1] += 1
+   currentExons[1,1] += 1
+
+   if test:
+      currentExons -= up_cut
+   else:
+      currentExons -= currentExons[0,0]
+
+   try:
+      if not (currentDNASeq[int(currentExons[0,1])] == 'g' and\
+      currentDNASeq[int(currentExons[0,1])+1] in ['c','t' ]):
+         print 'not aligned'
+         return nil
+
+      if not (currentDNASeq[int(currentExons[1,0])-1] == 'g' and currentDNASeq[int(currentExons[1,0])-2] == 'a'):
+         print 'not aligned'
+         return nil
+
+   except:
+      return nil
+      #pdb.set_trace()
+
+   # now we want to use interval_query to get the predicted splice scores
+   # trained on the TAIR sequence and annotation
+   interval_matrix = createIntArrayFromList([currentGene.start+up_cut-100,currentGene.start+down_cut+100])
+   num_fields = 1
+   num_intervals = 1
+   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
+   % (chr,strand)
+
+   ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
+
+   # fetch acceptor scores
+   gf = Genef()
+   num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
+   assert num_hits <= len(currentDNASeq)
    
-   io_pickle.save(dataset_file,dataset)
+   #print 'Acceptor hits: %d' % num_hits
+   pos = createIntArrayFromList([0]*num_hits)
+   indices = createIntArrayFromList([0]*num_hits)
+   scores = createDoubleArrayFromList([0]*num_hits*num_fields)
+   gf.getResults(pos,indices,scores)
+
+   acc  = [-inf]*(len(currentDNASeq))
+
+   for i in range(num_hits):
+      position = pos[i] 
+      position -= (currentGene.start+up_cut)
+      if position < 2 or position > len(currentDNASeq)-1:
+         continue
+      assert 0 <= position < len(currentDNASeq), 'position index wrong'
+      acc[position] = scores[i]
 
+   acc = acc[1:] + [-inf]
 
-if __name__ == '__main__':
+   del gf
 
+   # fetch donor scores
+   sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/don/output_SVMWD/pred/contig_%d%s'\
+   % (chr,strand)
+
+   gf = Genef()
+   num_hits = gf.query(sscore_filename, ['Conf'], num_fields, interval_matrix, num_intervals)
+   assert num_hits <= len(currentDNASeq)
+
+   gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
+
+   #print 'Donor hits: %d' % num_hits
+   pos = createIntArrayFromList([0]*num_hits)
+   indices = createIntArrayFromList([0]*num_hits)
+   scores = createDoubleArrayFromList([0]*num_hits*num_fields)
+   gf.getResults(pos,indices,scores)
+
+   don     = [-inf]*(len(currentDNASeq))
+
+   try:
+      for i in range(num_hits):
+         position = pos[i] 
+         position -= (currentGene.start+up_cut)
+         if position < 1 or position >= len(currentDNASeq)-1:
+            continue
+         don[position-1] = scores[i]
+   except:
+      return nil
+      #pdb.set_trace()
+
+   don = [-inf] + don[:-1]
+
+   #assert ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf], pdb.set_trace()
+   #assert gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf], pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(acc), pdb.set_trace()
+   #assert (len(currentDNASeq)) == len(don), pdb.set_trace()
+
+
+   if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
+      return nil
+
+   if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
+      return nil
+
+   if not (len(currentDNASeq)) == len(acc):
+      return nil
+
+   if not (len(currentDNASeq)) == len(don):
+      return nil
+
+   acc = [-inf] + acc[:-1]
+
+   return currentDNASeq, acc, don, currentExons, currentReadSeq, originalReadSeq, quality, spos
+
+
+def reverse_complement(seq):
+   map = {'a':'t','c':'g','g':'c','t':'a'}
+
+   new_seq = [map[elem] for elem in seq]
+   new_seq.reverse()
+   new_seq = "".join(new_seq)
+
+   return new_seq
+   
+if __name__ == '__main__':
    if len(sys.argv) == 1:
       print info
 
    assert len(sys.argv) == 6, help
-
-   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file):
+   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)