+ removed a lot of redundant/old code from the dataset generation functions
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sun, 11 May 2008 12:07:20 +0000 (12:07 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Sun, 11 May 2008 12:07:20 +0000 (12:07 +0000)
TODO
- think about a faster parser for the filtered reads (see the sketch below)
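
  One possible direction (a minimal sketch only; the field order is
  assumed from how compile_dataset_direct reads the filtered reads file,
  and the helper name iter_filtered_reads is illustrative): stream the
  file and yield one read at a time instead of materializing the full
  dict that parse_filtered_reads builds:

     def iter_filtered_reads(path):
        # lazily yield one read per line of the filtered reads file
        for line in open(path):
           slist = line.strip().split()
           yield {'id':     int(slist[0]),
                  'chr':    int(slist[1]),
                  'strand': slist[2],
                  'seq':    slist[3].lower()}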

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8978 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py
scripts/createNewDataset.py

index 839c91f..1d24c0e 100644
@@ -13,7 +13,6 @@ from numpy.matlib import mat,zeros,ones,inf
 
 import qpalma
 import qpalma.tools
-from qpalma.tools.parseGff import parse_gff
 
 from qpalma.parsers import *
 
@@ -23,355 +22,181 @@ from genome_utils import load_genomic
 
 import qpalma.Configuration as Conf
 
-warning = """
-   #########################################################
-   WARNING ! Disabled the use of non-bracket reads WARNING !
-   #########################################################
-   
-   """
-
-
-help = """
-
-   Usage of this script is:
-
-   ./compile_dataset.py gff_file  dna_flat_files solexa_reads remapped_reads dataset_file
 
+class DatasetGenerator:
    """
 
-info = """
-   The purpose of this script is to compile a full data set for QPalma out of the
-   following files: 
-
-   - The information of the gff (gene annotation).
-
-   - The flat dna files with the sequence of the respective gene positions
-
-   - The solexa reads in the filtered version 
-
-   - The solexa reads in the remapped version 
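+   This class compiles a training set (from the filtered reads) and a
+   testing set (from the map file) for QPalma; saveAs pickles both sets.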
    """
+   
+   def __init__(self,filtered_reads,map_file):
+      assert os.path.exists(filtered_reads), 'Error: Cannot find reads file'
+      self.filtered_reads = filtered_reads
 
-# some useful constants
-
-extended_alphabet = ['-','a','c','g','t','n','[',']']
-alphabet          = ['-','a','c','g','t','n']
-
-# some dummy elements to be returned if current example violates assertions
-nil_dna_frag = ('','','','','')
-remapped_nil = ('')
-process_nil  = ('','','','','','','','')
+      assert os.path.exists(map_file), 'Error: Cannot find map file'
+      self.map_file = map_file
 
-nil_splice_scores = ('','')
+      self.training_set = []
+      self.testing_set  = []
 
 
-def compile_d2(gff_file,dna_flat_files,filtered_reads,remapped_reads,dataset_file,test):
-   assert os.path.exists(filtered_reads)
-   assert os.path.exists(remapped_reads)
-   assert not os.path.exists(dataset_file), 'The data_file already exists!'
+   def saveAs(self,dataset_file):
+      for fn in ('%s.train.pickle'%dataset_file,'%s.test.pickle'%dataset_file):
+         assert not os.path.exists(fn), 'The data_file already exists!'
 
-   joinp = os.path.join
+      # saving new datasets
+      cPickle.dump(self.training_set,open('%s.train.pickle'%dataset_file,'w+'),protocol=2)
+      cPickle.dump(self.testing_set,open('%s.test.pickle'%dataset_file,'w+'),protocol=2)
 
-   # first read the gff file(s) and create pickled gene information
-   allGenes = [None]*6
 
-   for i in range(1,6):
-      allGenes[i]= parse_gff(gff_file%i) 
+   def compile_training_set(self):
+      joinp = os.path.join
 
-   print 'parsing map reads'
-   all_remapped_reads = parse_map_vm(remapped_reads)
-   print 'found %d remapped reads' % len(all_remapped_reads)
+      print 'parsing filtered reads..'
+      all_filtered_reads = parse_filtered_reads(self.filtered_reads)
+      print 'found %d filtered reads' % len(all_filtered_reads)
 
-   #pdb.set_trace()
+      # this stores the new dataset
+      SeqInfo        = []
+      Exons          = []
+      OriginalEsts   = []
+      Qualities      = []
 
-   print 'parsing filtered reads..'
-   all_filtered_reads = parse_filtered_reads(filtered_reads)
-   print 'found %d filtered reads' % len(all_filtered_reads)
+      # Iterate over all filtered reads in order to generate for each read a
+      # training example
+      instance_counter = 1
+      skipped_ctr = 0
 
-   # this stores the new dataset
-   SeqInfo = []
-   OriginalEsts = []
-   Qualities = []
-
-   # Iterate over all remapped reads in order to generate for each read an
-   # training / prediction example
-   instance_counter = 0
-   skipped_ctr = 0
-   num_missed_reads  = 0
+      for id,filteredRead in all_filtered_reads.items():
+         if instance_counter % 1001 == 0:
+            print 'processed %d examples' % instance_counter
 
-   for reId,allreReads in all_remapped_reads.items():
+         if filteredRead['strand'] != '+':
+            skipped_ctr += 1
+            continue
 
-         try:
-            currentFRead = all_filtered_reads[reId]
-         except:
-            skipped_ctr +=1 
-            num_missed_reads += len(allreReads) 
+         if filteredRead['chr'] not in range(1,6):
+            skipped_ctr += 1
             continue
 
-         #pdb.set_trace()
+         # we cut out the genomic region for training 
+         CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
+         genomicSeq_start  = filteredRead['p_start'] - CUT_OFFSET
+         genomicSeq_stop   = filteredRead['p_stop'] + CUT_OFFSET
+
+         # information of the full read we need for training
+         chromo         = filteredRead['chr']
+         strand         = filteredRead['strand']
+         original_est   = filteredRead['seq']
+         quality        = filteredRead['prb']
+         cal_prb        = filteredRead['cal_prb']
+         chastity       = filteredRead['chastity']
+
+         # add instance to set
+         SeqInfo.append((id,chromo,strand,genomicSeq_start,genomicSeq_stop))
+         OriginalEsts.append(original_est)
+         Qualities.append( (quality,cal_prb,chastity) )
 
-         for reRead in allreReads:
-               
-            if instance_counter % 1000 == 0:
-               print 'processed %d examples' % instance_counter
+         instance_counter += 1
 
-            if reRead['strand'] != '+':
-               continue
-         
-            (id,chr,strand,genomicSeq_start,genomicSeq_stop) =\
-            process_map(reRead,currentFRead,dna_flat_files)
+      print 'Full dataset has size %d' % len(SeqInfo)
+      print 'Skipped %d reads' % skipped_ctr
 
-            # information of the full read we are allowed to know
-            original_est = currentFRead['seq']
-            quality  = currentFRead['prb']
-            cal_prb  = currentFRead['cal_prb']
-            chastity = currentFRead['chastity']
+      self.training_set = [SeqInfo, OriginalEsts, Qualities]
+      
 
-            # add instance to set
-            SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
-            OriginalEsts.append(original_est)
-            Qualities.append( (quality,cal_prb,chastity) )
+   def compile_testing_set(self):
 
-            instance_counter += 1
+      strand_map = ['-','+']
 
-   print 'Full dataset has size %d' % len(SeqInfo)
-   print 'Skipped %d reads' % skipped_ctr
-   print 'Num. missed reads %d reads' % num_missed_reads
+      SeqInfo = []
+      OriginalEsts = []
+      Qualities = []
 
-   dataset = [SeqInfo, OriginalEsts, Qualities ]
-
-   # saving new dataset
-   cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
+      instance_counter = 1
 
+      for line in open(self.map_file):
+         if instance_counter % 1001 == 0:
+            print 'processed %d examples' % instance_counter
 
+         line  = line.strip()
+         slist = line.split()
+         id       = int(slist[0])
+         chromo   = int(slist[1])
+         pos      = int(slist[2])
+         strand   = slist[3]
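+         # map the strand code to '+'/'-' (only 'D' yields the forward strand)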
+         strand   = strand_map[strand == 'D']
 
-"""
-def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
+         genomicSeq_start = pos - 1500
+         genomicSeq_stop  = pos + 1500
 
-   assert os.path.exists(filtered_reads)
-   assert os.path.exists(remapped_reads)
-   assert not os.path.exists(dataset_file), 'The data_file already exists!'
+         original_est = slist[11]
+         original_est = original_est.lower()
 
-   joinp = os.path.join
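+         # decode the per-base quality strings (prb/cal_prb/chastity) from ASCII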
+         prb      = [ord(elem)-50 for elem in slist[8]]
+         cal_prb  = [ord(elem)-64 for elem in slist[9]]
+         chastity = [ord(elem)+10 for elem in slist[10]]
 
-   # first read the gff file(s) and create pickled gene information
-   allGenes = [None]*6
+         # add instance to set
+         SeqInfo.append((id,chromo,strand,genomicSeq_start,genomicSeq_stop))
+         OriginalEsts.append(original_est)
+         Qualities.append( (prb,cal_prb,chastity) )
 
-   for i in range(1,6):
-      allGenes[i]= parse_gff(gff_file%i) #,joinp(tmp_dir,'gff_info.pickle'))
+         instance_counter += 1
 
-   #print 'parsing filtered reads..'
-   #frp = FilteredReadParser(filtered_reads)
-   #all_filtered_reads = frp.parse()
-   #print 'found %d filtered reads' % len(all_filtered_reads)
+      # store the full set
+      self.testing_set = [SeqInfo, OriginalEsts, Qualities]
 
-   print 'parsing map reads'
-   rrp = MapParser(remapped_reads)
-   all_remapped_reads = rrp.parse()
 
-   if all_remapped_reads != None:
-      size_rem = len(all_remapped_reads)
-   else:
-      size_rem = 0
+def compile_dataset_direct(filtered_reads,dataset_file):
 
-   print 'found %d remapped reads' % size_rem
+   strand_map = ['-','+']
 
-   # this stores the new dataset
    SeqInfo = []
-   Exons = []
    OriginalEsts = []
    Qualities = []
-   AlternativeSequences = []
-   SplitPositions = []
 
-   # Iterate over all remapped reads in order to generate for each read an
-   # training / prediction example
    instance_counter = 0
-   noVmatchHitCtr = 0
-   posCtr = 0
 
-   #for fId,currentFRead in all_filtered_reads.items():
-   for current_line in open(filtered_reads):
+   for line in open(filtered_reads):
+      line  = line.strip()
+      slist = line.split()
+      id    = int(slist[0])
 
-         currentFRead = parseLine(current_line)
-         fId = currentFRead['id']
-
-         #pdb.set_trace()
-
-         if currentFRead['strand'] != '+':
-            continue
-   
-         #reId = str(1000000000000+int(fId))
-         reId = str(fId)
-
-         try:
-            reReads = all_remapped_reads[reId]
-         except:
-            noVmatchHitCtr += 1
-            continue
-
-         gene_id = currentFRead['gene_id']
-         chro = currentFRead['chr']
-
-         #try:
-         #   gene_id = currentFRead['gene_id']
-         #   chro = currentFRead['chr']
-         #except:
-         #   continue
-
-         seq_info, exons, cut_pos =\
-            process_filtered_read(currentFRead,dna_flat_files,test)
-
-          seq_info = (id,chr,strand,up_cut,down_cut,true_cut) = seq_info
-
-           original_est = currentFRead['seq']
-
-         onePositiveLabel,alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
+      if id >= 1000000300000:
+         continue
 
-         if onePositiveLabel:
-            posCtr += 1
+      if instance_counter % 1000 == 0:
+         print 'processed %d examples' % instance_counter
 
-         SeqInfo.append(seq_info)
-         Exons.append(exons)
-         OriginalEsts.append(original_est)
+      chr      = int(slist[1])
+      strand   = slist[2]
+      strand   = strand_map[strand == 'D']
 
-         quality  = currentFRead['prb']
-         cal_prb  = currentFRead['cal_prb']
-         chastity = currentFRead['chastity']
+      genomicSeq_start = int(slist[10]) - 1000
+      genomicSeq_stop  = int(slist[13]) + 1000
 
-         Qualities.append( (quality,cal_prb,chastity) )
-         AlternativeSequences.append(alternative_seq)
-         SplitPositions.append(cut_pos)
+      original_est = slist[3]
+      original_est = original_est.lower()
 
-         instance_counter += 1
+      prb      = [ord(elem)-50 for elem in slist[6]]
+      cal_prb  = [ord(elem)-64 for elem in slist[7]]
+      chastity = [ord(elem)+10 for elem in slist[8]]
 
-         if instance_counter % 1000 == 0:
-            print 'processed %d examples' % instance_counter
 
-         if instance_counter == 400000:
-            break
+      # add instance to set
+      SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
+      OriginalEsts.append(original_est)
+      Qualities.append( (prb,cal_prb,chastity) )
 
-   print 'Full dataset has size %d' % len(SeqInfo)
-   print 'Number of reads which had no Vmatch hit: %d' % noVmatchHitCtr
-   print 'Number of reads which had correct Vmatch hit: %d' % posCtr
+      instance_counter += 1
 
-   dataset = [SeqInfo, Exons, OriginalEsts, Qualities,\
-   AlternativeSequences, SplitPositions]
+   dataset = [SeqInfo, OriginalEsts, Qualities ]
 
    # saving new dataset
-   #io_pickle.save(dataset_file,dataset)
    cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
-"""
-
-def process_filtered_read(fRead,dna_flat_files):
-   """
-   Creates a training example
-   """
-
-   id            = fRead['id']
-   id = int(id)
-
-   chr            = fRead['chr']
-   strand         = fRead['strand']
-
-   genomic_start  = fRead['p_start']
-   genomic_stop   = fRead['p_stop']
-
-   true_cut       = fRead['true_cut']
-
-   CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-
-   if genomic_start <= CUT_OFFSET:
-      up_cut   = genomic_start
-   else:
-      up_cut   = genomic_start - CUT_OFFSET
-
-   down_cut = genomic_stop + CUT_OFFSET
 
-   seq_info = (id,chr,strand,up_cut,down_cut,true_cut)
 
-   # now that we have p_start and p_stop properly 
-   # we can define the correct exons borders
-   if true_cut == -1:
-      currentExons = zeros((2,1),dtype=numpy.int)
-
-      currentExons[0,0] = fRead['p_start']
-      currentExons[1,0] = fRead['p_stop']
-
-      #pdb.set_trace()
-
-   else:
-      currentExons = zeros((2,2),dtype=numpy.int)
-
-      currentExons[0,0] = fRead['p_start']
-      currentExons[0,1] = fRead['exon_stop']
-
-      currentExons[1,0] = fRead['exon_start']
-      currentExons[1,1] = fRead['p_stop']
-
-   return seq_info, currentExons, fRead['true_cut']
-
-
-def _getSpliceScores(chr,strand,intervalBegin,intervalEnd):
-   """
-   Now we want to use interval_query to get the predicted splice scores trained
-   on the TAIR sequence and annotation.
-   """
-   
-   assert intervalEnd > intervalBegin
-
-   size = intervalEnd-intervalBegin
-
-   acc = size*[0.0]
-   don = size*[0.0]
-
-   interval_matrix = createIntArrayFromList([intervalBegin,intervalEnd])
-   pos_size = new_intp()
-   intp_assign(pos_size,1)
-
-   # fetch acceptor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc/contig_%d%s' % (chr,strand)
-   result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
-
-   num_hits = result[0]
-   assert num_hits <= size, pdb.set_trace()
-   pos      =  result[1]
-   scores   =  result[3]
-
-   acc  = [-inf]*(size)
-   
-   #print [p - intervalBegin for p in pos]
-
-   for i in range(num_hits):
-      position = pos[i] - intervalBegin
-      if not (position > 1 and position < size):
-         continue
-
-      assert 0 <= position <= size, pdb.set_trace()
-      acc[position] = scores[i]
-
-   # fetch acceptor scores
-   sscore_filename = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don/contig_%d%s' % (chr,strand)
-   result = interval_query(sscore_filename, ['Conf'], 1, interval_matrix, 1,pos_size)
-
-   num_hits = result[0]
-   assert num_hits <= size, pdb.set_trace()
-   pos      =  result[1]
-   scores   =  result[3]
-
-   don     = [-inf]*(size)
-
-   #print [p - intervalBegin for p in pos]
-
-   for i in range(num_hits):
-      position = pos[i] - intervalBegin
-      if not ( position > 1 and position < size ):
-         continue
-
-      assert 0 <= position <= size, pdb.set_trace()
-      don[position] = scores[i]
-
-   return acc, don
 
 
 def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
@@ -379,10 +204,9 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
    Now we want to use interval_query to get the predicted splice scores trained
    on the TAIR sequence and annotation.
    """
-   
-   assert intervalEnd > intervalBegin
 
    size = intervalEnd-intervalBegin
+   assert size > 1, 'Error (getSpliceScores): interval size is less than 2!'
 
    acc = size*[0.0]
    don = size*[0.0]
@@ -402,7 +226,7 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
    return acc, don
 
 
-def process_map(currentRRead,fRead,dna_flat_files):
+def process_map(currentRRead,fRead):
    """
    For all matches found by Vmatch we calculate the DNA fragment on which we
    would like to perform an alignment during prediction.
@@ -441,16 +265,16 @@ def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_file
    then returns the genomic sequence of this interval and the associated scores.
    """
 
+   assert genomicSeq_start < genomicSeq_stop
+
    chrom         = 'chr%d' % chr
    genomicSeq = load_genomic(chrom,strand,genomicSeq_start-1,genomicSeq_stop,dna_flat_files,one_based=False)
    genomicSeq = genomicSeq.lower()
 
-
    # check the obtained dna sequence
    assert genomicSeq != '', 'load_genomic returned empty sequence!'
-   #for elem in genomicSeq:
-   #   if not elem in alphabet:
    
+   # all entries other than a c g t are set to n
    no_base = re.compile('[^acgt]')
    genomicSeq = no_base.sub('n',genomicSeq)
 
@@ -488,6 +312,7 @@ def reverse_complement(seq):
 
    return new_seq
 
+
 def get_seq(begin,end,exon_end):
    """
    """
@@ -510,14 +335,6 @@ def get_seq(begin,end,exon_end):
    return genomicSeq
 
 
-if __name__ == '__main__':
-   if len(sys.argv) == 1:
-      print info
-
-   assert len(sys.argv) == 6, help
-   compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)
-
-
 def parseLine(line):
    """
    We assume that a line has the following entries:
@@ -562,3 +379,11 @@ def parseLine(line):
    'p_stop':p_stop,'true_cut':true_cut}
 
    return line_d
+
+
+if __name__ == '__main__':
+   assert len(sys.argv) == 4, 'Usage: ./compile_dataset.py filtered_reads map_file dataset_file'
+
+   dg = DatasetGenerator(sys.argv[1],sys.argv[2])
+   dg.compile_training_set()
+   dg.compile_testing_set()
+   dg.saveAs(sys.argv[3])
index 15608de..a8a761b 100644
@@ -2,15 +2,18 @@
 # -*- coding: utf-8 -*-
 
 import qpalma.Configuration as Conf
-from compile_dataset import compile_d2
+from compile_dataset import DatasetGenerator
 
-#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/allReads.full'
-#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/map.vm_cut'
+#filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.full.newid'
+#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map_2nd.vm_chr'
 
-#remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/spliced.result_no_comm'
+filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.test'
+remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map.test'
 
+#compile_d2(Conf.dna_flat_fn,filtered_fn,remapped_fn,'dataset_2nd')
+#compile_dataset_direct(filtered_fn,'dataset_all_spliced_reads_2.pickle')
 
-filtered_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/allReads.full.newid'
-remapped_fn = '/fml/ag-raetsch/share/projects/qpalma/solexa/paper_data/map_2nd.vm_chr'
-
-compile_d2(Conf.gff_fn,Conf.dna_flat_fn,filtered_fn,remapped_fn,'dataset_2nd',True)
+dg = DatasetGenerator(filtered_fn,remapped_fn)
+dg.compile_training_set()
+dg.compile_testing_set()
+dg.saveAs('dataset_11_05')
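
Note: the pickles written by saveAs can be loaded back directly (a minimal
sketch; the file names follow the '%s.train.pickle' / '%s.test.pickle'
pattern used in saveAs):

   import cPickle

   SeqInfo, OriginalEsts, Qualities = cPickle.load(open('dataset_11_05.train.pickle'))
   print 'training set has %d examples' % len(SeqInfo)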