+ added use of all remapped possibilities of a given read
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 21 Feb 2008 15:21:40 +0000 (15:21 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Thu, 21 Feb 2008 15:21:40 +0000 (15:21 +0000)
git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@7932 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py

index 56e7b42..8557fad 100644 (file)
@@ -73,6 +73,7 @@ alphabet          = ['-','a','c','g','t','n']
 
 # some dummy elements to be returned if current example violates assertions
 nil_dna_frag = ('','','','')
+remapped_nil = ('','','')
 process_nil  = ('','','','','','')
 
 nil_splice_scores = ('','')
@@ -122,6 +123,10 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    FReads = []
    RemappedReads = []
 
+   AlternativeSequences = []
+   AlternativeAcceptors = []
+   AlternativeDonors    = []
+
    # Iterate over all remapped reads in order to generate for each read an
    # training / prediction example
    instance_counter = 0
@@ -148,7 +153,9 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          seq, acc, don, exons, est, qual =\
             process_filtered_read(currentFRead,currentGene,dna_flat_files,test)
 
-         #alternative_seq, alternative_acc, alternative_don = process_remapped_reads(reReads)
+         original_est = currentFRead['seq']
+
+         alternative_seq, alternative_acc, alternative_don = process_remapped_reads(reReads,currentFRead,dna_flat_files)
 
          if seq == '':
             continue
@@ -158,19 +165,19 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          Donors.append(don)
          Exons.append(exons)
          Ests.append(est)
-         #OriginalEsts.append(original_est)
+         OriginalEsts.append(original_est)
          Qualities.append(qual)
          #SplitPositions.append(spos)
          #Ids.append(id)
          #FReads.append(currentFRead)
 
-         #AlternativeSequences.append(alternative_seq)
-         #AlternativeDonors.append(alternative_don)
-         #AlternativeAcceptors.append(alternative_acc)
+         AlternativeSequences.append(alternative_seq)
+         AlternativeAcceptors.append(alternative_acc)
+         AlternativeDonors.append(alternative_don)
 
          instance_counter += 1
 
-         if instance_counter % 100 == 0:
+         if instance_counter % 1000 == 0:
             print 'processed %d examples' % instance_counter
 
          if instance_counter == 10000:
@@ -178,9 +185,19 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
 
    print 'Full dataset has size %d' % len(Sequences)
 
-   dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
-   'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
-   'SplitPositions':SplitPositions,'Ids':Ids, 'FReads':FReads}
+   dataset = [Sequences, Acceptors, Donors,\
+   Exons, Ests, OriginalEsts, Qualities,\
+   AlternativeSequences,\
+   AlternativeAcceptors,\
+   AlternativeDonors]
+
+   #dataset = {'Sequences':Sequences, 'Acceptors':Acceptors, 'Donors':Donors,\
+   #'Exons':Exons, 'Ests':Ests, 'OriginalEsts':OriginalEsts, 'Qualities':Qualities,\
+   #'AlternativeSequences':AlternativeSequences,\
+   #'AlternativeAcceptors':AlternativeAcceptors,\
+   #'AlternativeDonors':AlternativeDonors}
+
+   #'Ids':Ids, 'FReads':FReads}
 
    # saving new dataset
    io_pickle.save(dataset_file,dataset)
@@ -448,18 +465,22 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
 
    #print 'entering getSpliceScores...'
 
+   ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
+   gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
+
    acc = len(currentDNASeq)*[0.0]
    don = len(currentDNASeq)*[0.0]
 
    interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
+   pos_size = new_intp()
+   intp_assign(pos_size,1)
+
+   # now fetch acceptor and donor scores:
+
    sscore_filename = '/fml/ag-raetsch/share/projects/genefinding/A_thaliana/jonas_new_genes/exp/sensors/acc/output_SVMWD/pred/contig_%d%s'\
    % (chr,strand)
 
-   ag_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and currentDNASeq[p-1]=='a' and e=='g' ]
-
    # fetch acceptor scores
-   pos_size = new_intp()
-   intp_assign(pos_size,1)
    result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
 
    num_hits = result[0]
@@ -490,8 +511,6 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
    pos      =  result[1]
    scores   =  result[3]
 
-   gt_tuple_pos = [p for p,e in enumerate(currentDNASeq) if p>0 and p<len(currentDNASeq)-1 and e=='g' and (currentDNASeq[p+1]=='t' or currentDNASeq[p+1]=='c')]
-
    #print 'Donor hits: %d' % num_hits
    don     = [-inf]*(len(currentDNASeq))
 
@@ -515,6 +534,7 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
 
    if not ag_tuple_pos == [p for p,e in enumerate(acc) if e != -inf]:
       print 'problem with acceptor scores'
+      pydb.debugger()
       return nil_splice_scores
 
    if not gt_tuple_pos == [p for p,e in enumerate(don) if e != -inf]:
@@ -534,46 +554,72 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_o
    return don, acc
 
 
-def process_remapped_reads(reReads,fRead):
+def process_remapped_reads(reReads,fRead,dna_flat_files):
    """
    For all matches found by Vmatch we calculate the fragment of the DNA we
    would like to perform an aligment during prediction.
    """
 
-   truePos = fRead['pos']
+   fragment_size = 3000
+
+   seq = fRead['seq']
+   strand = fRead['strand']
+   pos = fRead['p_start']
+   chr = fRead['chr']
+   chrom         = 'chr%d' % chr
+
+   allDNASeq   = []
+   allAcc      = []
+   allDon      = []
 
    for currentRRead in reReads:
-      rId            = rRead['id']
+      rId            = currentRRead['id']
 
-      pos1           = rRead['pos1']
-      chr1           = rRead['chr1']
-      seq1           = rRead['seq1']
+      pos1           = currentRRead['pos1']
+      chr1           = currentRRead['chr1']
+      seq1           = currentRRead['seq1']
 
-      both_matches = False
+      s1_pos = seq.find(seq1)
 
-      if both_matches:
-         pos2           = rRead['pos2']
-         chr2           = rRead['chr2']
-         seq2           = rRead['seq2']
+      seq2           = currentRRead['seq2']
 
-      # try to find the anchor of the current read i.e. that part that is bigger
-      # and matched such that we can be sure that it is somehow correctly aligned
-      if pos2 > pos1:
-         remappedPos = pos1
+      if not seq2 == '':
+         pos2           = currentRRead['pos2']
+         chr2           = currentRRead['chr2']
+         s2_pos = seq.find(seq2)
+   
+      # vmatch found a correct position
+      if pos + s1_pos == pos1:
+         genomicSeq_start  = pos
+         genomicSeq_stop   = fRead['p_stop']
       else:
-         remappedPos = pos2
-         hit_pos = currentReadSeq.find(remapped_seq)
+         genomicSeq_start  = pos1
+         genomicSeq_stop   = pos1 + fragment_size
+         
+      #return remapped_nil
+
+      genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
+      genomicSeq = genomicSeq.lower()
 
-         correct_remapping = False
+      # check the obtained dna sequence
+      assert genomicSeq != '', 'load_genomic returned empty sequence!'
+      for elem in genomicSeq:
+         assert elem in alphabet, pdb.set_trace()
 
-         first_start    = pos1
-         first_stop     = pos1+len(seq1)-1
+      intervalBegin  = genomicSeq_start-100
+      intervalEnd    = genomicSeq_stop+100
+      currentDNASeq  = genomicSeq
+      seq_pos_offset = genomicSeq_start
+
+      #pydb.debugger()
 
-         second_start   = pos2
-         second_stop    = pos2+len(seq2)-1
+      currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd,currentDNASeq,seq_pos_offset)
 
-         total_fragment_size = second_stop - first_start
+      allDNASeq.append(currentDNASeq)
+      allAcc.append(currentAcc)
+      allDon.append(currentDon)
 
+   return allDNASeq, allAcc, allDon
 
 def reverse_complement(seq):
    map = {'a':'t','c':'g','g':'c','t':'a'}