+ original reads sequence now has to be reconstructred automatically as we have
authorfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 12:00:47 +0000 (12:00 +0000)
committerfabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 30 May 2008 12:00:47 +0000 (12:00 +0000)
no ground truth file anymore

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@9309 e1793c9e-67f9-0310-80fc-b846ff1f7b36

tools/run_specific_scripts/transcriptome_analysis/compile_dataset.py
tools/run_specific_scripts/transcriptome_analysis/createNewDataset.py

index d25c392..600e420 100644 (file)
@@ -8,20 +8,33 @@ import re
 import pdb
 import cPickle
 
-import numpy
-from numpy.matlib import mat,zeros,ones,inf
+#import numpy
+#from numpy.matlib import mat,zeros,ones,inf
 
 import qpalma
 import qpalma.tools
 
 from qpalma.parsers import *
+from qpalma.sequence_utils import *
 
-from Genefinding import *
-
-from genome_utils import load_genomic
+#from Genefinding import *
+#from genome_utils import load_genomic
 
 import qpalma.Configuration as Conf
 
+def create_bracket_seq(dna_seq,read_seq):
+   """
+   This function takes a dna sequence and a read sequence and returns the
+   bracket format of the match/mismatches i.e.
+
+   dna : aaa
+   read: aac
+   is written in bracket notation: aa[ac]
+   """
+   assert len(dna_seq) == len(read_seq)
+   return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
+      
+
 
 class DatasetGenerator:
    """
@@ -49,7 +62,7 @@ class DatasetGenerator:
       cPickle.dump(self.dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
 
 
-   def parse_map_file(self,dataset,map_file):
+   def parse_map_file(self,dataset,map_file,first_map):
       strand_map = ['-','+']
       instance_counter = 1
 
@@ -65,6 +78,15 @@ class DatasetGenerator:
          strand   = slist[3]
          strand   = strand_map[strand == 'D']
 
+         if first_map:
+            read_seq = slist[7]
+         else:
+            read_seq = slist[11]
+
+         _prb      = slist[8]
+         _cal_prb  = slist[9]
+         _chastity = slist[10]
+
          if strand != '+':
             continue
 
@@ -75,14 +97,18 @@ class DatasetGenerator:
          genomicSeq_stop  = pos + 1500
 
          # fetch missing information from original reads
-         filteredRead = self.all_filtered_reads[id]
-         original_est = filteredRead['seq']
-         original_est = original_est.lower()
+         flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+         original_info = get_seq_and_scores(chromo,strand,pos,pos+35,flat_files)
+         dna_seq = original_info[0]
+
+         #filteredRead = self.all_filtered_reads[id]
+         #original_est = filteredRead['seq']
+         original_est = create_bracket_seq(dna_seq,read_seq)
 
-         original_est = filteredRead['seq']
-         prb     = filteredRead['prb']
-         cal_prb = filteredRead['cal_prb']
-         chastity = filteredRead['chastity']
+      for idx in range(read_size):
+         prb[idx]       = ord(_prb[idx])-50
+         cal_prb[idx]   = ord(_cal_prb[idx])-64
+         chastity[idx]  = ord(_chastity[idx])+10
 
          # add instance to set
          currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
@@ -110,167 +136,8 @@ class DatasetGenerator:
       # usually we have two files to parse:
       # the map file from the second run and a subset of the map file from the
       # first run
-      dataset = self.parse_map_file(dataset,self.map_file)
-      dataset = self.parse_map_file(dataset,self.map_2nd_file)
+      dataset = self.parse_map_file(dataset,self.map_file,True)
+      dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
 
       # store the full set
       self.testing_set = dataset
-
-
-
-
-def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
-   """
-   Now we want to use interval_query to get the predicted splice scores trained
-   on the TAIR sequence and annotation.
-   """
-
-   size = intervalEnd-intervalBegin
-   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
-
-   acc = size*[0.0]
-   don = size*[0.0]
-
-   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
-   pos_size = new_intp()
-   intp_assign(pos_size,1)
-
-   # fetch acceptor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s'
-   acc = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-   # fetch donor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s'
-   don = doIntervalQuery(chr,strand,intervalBegin,intervalEnd,sscore_filename)
-
-   return acc, don
-
-
-def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
-   """
-   This function expects an interval, chromosome and strand information and
-   returns then the genomic sequence of this interval and the associated scores.
-   """
-
-   assert genomicSeq_start < genomicSeq_stop
-
-   chrom         = 'chr%d' % chr
-   genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   # check the obtained dna sequence
-   assert genomicSeq != '', 'load_genomic returned empty sequence!'
-   
-   # all entries other than a c g t are set to n
-   no_base = re.compile('[^acgt]')
-   genomicSeq = no_base.sub('n',genomicSeq)
-
-   intervalBegin  = genomicSeq_start-100
-   intervalEnd    = genomicSeq_stop+100
-   seq_pos_offset = genomicSeq_start
-
-   currentAcc, currentDon = getSpliceScores(chr,strand,intervalBegin,intervalEnd)
-
-   currentAcc = currentAcc[100:-98]
-   currentAcc = currentAcc[1:]
-   currentDon = currentDon[100:-100]
-
-   length = len(genomicSeq)
-   currentAcc = currentAcc[:length]
-
-   currentDon = currentDon+[-inf]*(length-len(currentDon))
-
-   ag_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>1 and genomicSeq[p-1]=='a' and genomicSeq[p]=='g' ]
-   gt_tuple_pos = [p for p,e in enumerate(genomicSeq) if p>0 and p<len(genomicSeq)-1 and e=='g' and (genomicSeq[p+1]=='t' or genomicSeq[p+1]=='c')]
-
-   assert ag_tuple_pos == [p for p,e in enumerate(currentAcc) if e != -inf and p > 1], pdb.set_trace()
-   assert gt_tuple_pos == [p for p,e in enumerate(currentDon) if e != -inf and p > 0], pdb.set_trace()
-   assert len(currentAcc) == len(currentDon)
-
-   return genomicSeq, currentAcc, currentDon
-
-
-def reverse_complement(seq):
-   map = {'a':'t','c':'g','g':'c','t':'a'}
-
-   new_seq = [map[elem] for elem in seq]
-   new_seq.reverse()
-   new_seq = "".join(new_seq)
-
-   return new_seq
-
-
-def get_seq(begin,end,exon_end):
-   """
-   """
-
-   dna_flat_files    =  '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-   if exon_end:
-      gene_start = begin
-      gene_stop  = end+2
-   else:
-      gene_start = begin-2
-      gene_stop  = end
-
-   chrom         = 'chr%d' % 1
-   strand        = '+'
-
-   genomicSeq = load_genomic(chrom,strand,gene_start,gene_stop,dna_flat_files,one_based=False)
-   genomicSeq = genomicSeq.lower()
-
-   return genomicSeq
-
-
-def parseLine(line):
-   """
-   We assume that a line has the following entries:
-   
-   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
-
-   """
-   #try:
-   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
-   #except:
-   #   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-   #   true_cut = -1
-
-   splitpos = int(splitpos)
-   read_size = int(read_size)
-
-   seq=seq.lower()
-
-   assert strand in ['D','P']
-
-   if strand == 'D':
-      strand = '+'
-
-   if strand == 'P':
-      strand = '-'
-
-   chr = int(chr)
-
-   prb = [ord(elem)-50 for elem in prb]
-   cal_prb = [ord(elem)-64 for elem in cal_prb]
-   chastity = [ord(elem)+10 for elem in chastity]
-
-   p_start = int(p_start)
-   exon_stop = int(exon_stop)
-   exon_start = int(exon_start)
-   p_stop = int(p_stop)
-   true_cut = int(true_cut)
-
-   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
-   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
-   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
-   'p_stop':p_stop,'true_cut':true_cut}
-
-   return line_d
-
-
-if __name__ == '__main__':
-   if len(sys.argv) == 1:
-      print info
-
-   assert len(sys.argv) == 6, help
-   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)
index 06961a2..ca1b7c7 100644 (file)
@@ -4,7 +4,8 @@
 import qpalma.Configuration as Conf
 from compile_dataset import DatasetGenerator
 
-map_1_fn = '/tmp/fabio/spliced.heuristic'
+#map_1_fn = '/tmp/fabio/spliced.heuristic'
+map_1_fn = '/tmp/fabio/map_2nd.vm'
 map_2_fn = '/tmp/fabio/map_2nd.vm'
 
 dg = DatasetGenerator(map_1_fn,map_2_fn)