+ using new parsers now
author fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 23:52:29 +0000 (23:52 +0000)
committer fabio <fabio@e1793c9e-67f9-0310-80fc-b846ff1f7b36>
Fri, 18 Apr 2008 23:52:29 +0000 (23:52 +0000)
+ store only minimum amount of information needed to perform prediction

git-svn-id: http://svn.tuebingen.mpg.de/ag-raetsch/projects/QPalma@8697 e1793c9e-67f9-0310-80fc-b846ff1f7b36

scripts/compile_dataset.py

index df619a5..0be5ec0 100644 (file)
@@ -15,7 +15,7 @@ import qpalma
 import qpalma.tools
 from qpalma.tools.parseGff import parse_gff
 
-from qpalma.parsers import FilteredReadParser, RemappedReadParser, MapParser
+from qpalma.parsers import *
 
 from Genefinding import *
 
@@ -50,20 +50,6 @@ info = """
    - The solexa reads in the filtered version 
 
    - The solexa reads in the remapped version 
-
-   This represents a dataset for use with QPalma
-   The entries are def'd as follows:
-
-   - 'id' a unique id for the read
-   - 'chr' the chromosome identifier
-   - 'strand' the identifier for the strand either '+' or '-'
-
-   - 'read_start' 
-   - 'exon_stop'
-   - 'exon_start'
-   - 'read_stop'
-
-   - 'remappped_pos' - the position of the remapped read
    """
 
 # some useful constants
@@ -78,52 +64,88 @@ process_nil  = ('','','','','','','','')
 
 nil_splice_scores = ('','')
 
-def parseLine(line):
-   """
-   We assume that a line has the following entries:
-   
-   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
 
-   """
-   #try:
-   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
-   #except:
-   #   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
-   #   true_cut = -1
+def compile_d2(gff_file,dna_flat_files,filtered_reads,remapped_reads,dataset_file,test):
+   assert os.path.exists(filtered_reads)
+   assert os.path.exists(remapped_reads)
+   assert not os.path.exists(dataset_file), 'The data_file already exists!'
 
-   splitpos = int(splitpos)
-   read_size = int(read_size)
+   joinp = os.path.join
 
-   seq=seq.lower()
+   # first read the gff file(s) and create pickled gene information
+   allGenes = [None]*6
 
-   assert strand in ['D','P']
+   for i in range(1,6):
+      allGenes[i]= parse_gff(gff_file%i) 
 
-   if strand == 'D':
-      strand = '+'
+   print 'parsing filtered reads..'
+   all_filtered_reads = parse_filtered_reads(filtered_reads)
+   print 'found %d filtered reads' % len(all_filtered_reads)
 
-   if strand == 'P':
-      strand = '-'
+   print 'parsing map reads'
+   all_remapped_reads = parse_map_vm(remapped_reads)
+   print 'found %d remapped reads' % len(all_remapped_reads)
 
-   chr = int(chr)
+   # this stores the new dataset
+   SeqInfo = []
+   OriginalEsts = []
+   Qualities = []
 
-   prb = [ord(elem)-50 for elem in prb]
-   cal_prb = [ord(elem)-64 for elem in cal_prb]
-   chastity = [ord(elem)+10 for elem in chastity]
+   # Iterate over all remapped reads in order to generate for each read a
+   # training / prediction example
+   instance_counter = 0
+   skipped_ctr = 0
 
-   p_start = int(p_start)
-   exon_stop = int(exon_stop)
-   exon_start = int(exon_start)
-   p_stop = int(p_stop)
-   true_cut = int(true_cut)
+   for reId,allreReads in all_remapped_reads.items():
 
-   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
-   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
-   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
-   'p_stop':p_stop,'true_cut':true_cut}
+         try:
+            currentFRead = all_filtered_reads[reId]
+         except:
+            skipped_ctr +=1 
+            continue
 
-   return line_d
+         for reRead in allreReads:
+               
+            if instance_counter % 1000 == 0:
+               print 'processed %d examples' % instance_counter
+
+            #if instance_counter == 400000:
+            #   break
+
+            if reRead['strand'] != '+':
+               continue
+   
+            #seq_info, exons, cut_pos =\
+            #   process_filtered_read(currentFRead,dna_flat_files)
+         
+            (id,chr,strand,genomicSeq_start,genomicSeq_stop) =\
+            process_map(reRead,currentFRead,dna_flat_files)
+
+            # information of the full read we are allowed to know
+            original_est = currentFRead['seq']
+            quality  = currentFRead['prb']
+            cal_prb  = currentFRead['cal_prb']
+            chastity = currentFRead['chastity']
 
+            # add instance to set
+            SeqInfo.append((id,chr,strand,genomicSeq_start,genomicSeq_stop))
+            OriginalEsts.append(original_est)
+            Qualities.append( (quality,cal_prb,chastity) )
+
+            instance_counter += 1
+
+
+   print 'Full dataset has size %d' % len(SeqInfo)
+   print 'Skipped %d reads' % skipped_ctr
 
+   dataset = [SeqInfo, OriginalEsts, Qualities ]
+
+   # saving new dataset
+   cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
+
+
+
+"""
 def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,dataset_file,test):
 
    assert os.path.exists(filtered_reads)
@@ -200,7 +222,9 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          seq_info, exons, cut_pos =\
             process_filtered_read(currentFRead,dna_flat_files,test)
 
-         original_est = currentFRead['seq']
+          seq_info = (id,chr,strand,up_cut,down_cut,true_cut) = seq_info
+
+           original_est = currentFRead['seq']
 
          onePositiveLabel,alternative_seq = process_map(reReads,currentFRead,dna_flat_files)
 
@@ -224,7 +248,7 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
          if instance_counter % 1000 == 0:
             print 'processed %d examples' % instance_counter
 
-         if instance_counter == 200000:
+         if instance_counter == 400000:
             break
 
    print 'Full dataset has size %d' % len(SeqInfo)
@@ -237,9 +261,9 @@ def compile_d(gff_file,dna_flat_files,filtered_reads,remapped_reads,tmp_dir,data
    # saving new dataset
    #io_pickle.save(dataset_file,dataset)
    cPickle.dump(dataset,open(dataset_file,'w+'),protocol=2)
+"""
 
-
-def process_filtered_read(fRead,dna_flat_files,test):
+def process_filtered_read(fRead,dna_flat_files):
    """
    Creates a training example
    """
@@ -378,7 +402,7 @@ def getSpliceScores(chr,strand,intervalBegin,intervalEnd):
    return acc, don
 
 
-def process_map(reReads,fRead,dna_flat_files):
+def process_map(currentRRead,fRead,dna_flat_files):
    """
   For all matches found by Vmatch we calculate the fragment of the DNA we
   would like to perform an alignment during prediction.
@@ -390,74 +414,25 @@ def process_map(reReads,fRead,dna_flat_files):
 
    onePositiveLabel = False
 
-   for currentRRead in reReads:
-      rId      = currentRRead['id']
-      pos      = currentRRead['pos']
-      chr      = currentRRead['chr']
-      strand   = currentRRead['strand']
-      seq      = currentRRead['seq']
+   rId      = currentRRead['id']
+   pos      = currentRRead['pos']
+   chr      = currentRRead['chr']
+   strand   = currentRRead['strand']
 
-      if strand != '+':
-         continue
-
-      length   = currentRRead['length']
-      offset   = currentRRead['offset']
-
-      currentLabel = False
-
-      CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
-   
-      # vmatch found the correct position
-      if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
-         currentLabel = True
-         genomicSeq_start = fRead['p_start'] - CUT_OFFSET
-         genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
-         #pdb.set_trace()
-      else:
-         #pdb.set_trace()
-         genomicSeq_start = pos - (fragment_size/2)
-         genomicSeq_stop = pos + (fragment_size/2)
-
-      onePositiveLabel = onePositiveLabel or currentLabel
-
-      alternativeSeq.append( (chr,strand,genomicSeq_start,genomicSeq_stop,currentLabel) )
-
-
-   return onePositiveLabel,alternativeSeq
-
-
-def check_remapped_reads(reReads,fRead,dna_flat_files):
-   """
-   For all matches found by Vmatch we calculate the fragment of the DNA we
-   would like to perform an aligment during prediction.
-   """
-
-   fragment_size = 3000
-
-   seq = fRead['seq']
-   strand = fRead['strand']
-   pos = fRead['p_start']
-   chr = fRead['chr']
-
-   print 'fRead:\t%d %d %d %d' %\
-   (fRead['p_start'],fRead['exon_stop'],fRead['exon_start'],fRead['p_stop'])
+   length   = currentRRead['length']
+   offset   = currentRRead['offset']
 
-   for currentRRead in reReads:
-      rId            = currentRRead['id']
-
-      pos1           = currentRRead['pos1']
-      chr1           = currentRRead['chr1']
-      seq1           = currentRRead['seq1']
+   CUT_OFFSET = random.randint(Conf.extension[0],Conf.extension[1])
 
-      seq2           = currentRRead['seq2']
-      pos2           = 0
+   # vmatch found the correct position
+   if fRead['chr'] == chr and fRead['strand'] == strand and fRead['p_start']-36 <= pos <= fRead['p_stop']+36:
+      genomicSeq_start = fRead['p_start'] - CUT_OFFSET
+      genomicSeq_stop = fRead['p_stop'] + CUT_OFFSET
+   else:
+      genomicSeq_start = pos - (fragment_size/2)
+      genomicSeq_stop = pos + (fragment_size/2)
 
-      if not seq2 == '':
-         pos2           = currentRRead['pos2']
-         chr2           = currentRRead['chr2']
-         s2_pos = seq.find(seq2)
-   
-      print 'reRead:\t %d %d %d %d'%(pos1,pos1+len(seq1),pos2,pos2+len(seq2))
+   return (rId,chr,strand,genomicSeq_start,genomicSeq_stop)
 
 
 def get_seq_and_scores(chr,strand,genomicSeq_start,genomicSeq_stop,dna_flat_files):
@@ -541,3 +516,49 @@ if __name__ == '__main__':
 
    assert len(sys.argv) == 6, help
    compile_d(gff_file,dna_flat_files,solexa_reads,remapped_reads,dataset_file)
+
+
+def parseLine(line):
+   """
+   We assume that a line has the following entries:
+   
+   read_nr,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop
+
+   """
+   #try:
+   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop,true_cut = line.split()
+   #except:
+   #   id,chr,strand,seq,splitpos,read_size,prb,cal_prb,chastity,gene_id,p_start,exon_stop,exon_start,p_stop = line.split()
+   #   true_cut = -1
+
+   splitpos = int(splitpos)
+   read_size = int(read_size)
+
+   seq=seq.lower()
+
+   assert strand in ['D','P']
+
+   if strand == 'D':
+      strand = '+'
+
+   if strand == 'P':
+      strand = '-'
+
+   chr = int(chr)
+
+   prb = [ord(elem)-50 for elem in prb]
+   cal_prb = [ord(elem)-64 for elem in cal_prb]
+   chastity = [ord(elem)+10 for elem in chastity]
+
+   p_start = int(p_start)
+   exon_stop = int(exon_stop)
+   exon_start = int(exon_start)
+   p_stop = int(p_stop)
+   true_cut = int(true_cut)
+
+   line_d = {'id':id, 'chr':chr, 'strand':strand, 'seq':seq, 'splitpos':splitpos,\
+   'read_size':read_size, 'prb':prb, 'cal_prb':cal_prb, 'chastity':chastity, 'gene_id':gene_id,\
+   'p_start':p_start, 'exon_stop':exon_stop, 'exon_start':exon_start,\
+   'p_stop':p_stop,'true_cut':true_cut}
+
+   return line_d