--- /dev/null
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2008 Fabio De Bona
+# Copyright (C) 2008 Max-Planck-Society
+
+import array
+import cPickle
+import os
+import pdb
+
+#from parsers import *
+from sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq,reconstruct_dna_seq
+
+
+def generateDataset(global_settings):
+ """
+ This function is responsible for the dataset generation for the matches of
+ the second vmatch run and the heuristic-filtered matches of the first run.
+
+ An example in our dataset consists of:
+
+ - information on the original read:
+ * id
+ * nucleotide sequence
+ * quality vectors (at the moment: prb and chastity)
+
+ - information on the target dna sequence:
+ * chromosome
+ * strand
+ * fragment start
+ * fragment end positions
+
+ this information is stored in tuples which are stored then in a dictionary
+ indexed by the reads unique id.
+
+ CAUTION: The dictionary stores a list of the above defined tuples under each
+ 'id'. This is because one read can be found several times on the genome.
+ """
+
+ assert os.path.exists(map_file), 'Error: Can not find map file'
+ print map_2nd_file
+ assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
+ self.map_file = map_file
+ self.map_2nd_file = map_2nd_file
+
+ dataset = {}
+
+ prb_offset = 64
+
+ # this means that around each vmatch hit we cut out a window of 1500 bases
+ # each up-/downstream.
+ half_window_size = 1500
+
+ instance_counter = 0
+
+ accessWrapper = DataAccessWrapper(settings)
+ seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
+
+ for line in open(map_file):
+ if instance_counter > 0 and instance_counter % 5000 == 0:
+ print 'processed %d examples' % instance_counter
+
+ slist = line.split()
+
+ id = int(slist[0])
+ chromo = int(slist[1])
+ pos = int(slist[2])
+
+ # for the time being we only consider chromosomes 1-5
+ if not chromo in range(1,6):
+ continue
+
+ # convert D/P strand info to +/-
+ strand = ['-','+'][slist[3] == 'D']
+
+ # QPalma uses lowercase characters
+ bracket_seq = slist[4].lower()
+ read_seq = unbracket_seq(bracket_seq)
+ reconstructed_dna_seq = reconstruct_dna_seq(bracket_seq)
+
+ if strand == '-' and first_round:
+ read_seq = reverse_complement(read_seq)
+ #reconstructed_dna_seq = reverse_complement(reconstructed_dna_seq)
+
+ assert not '[' in read_seq and not ']' in read_seq
+
+ # we use an area of +/- `self.half_window_size` nucleotides around the seed position
+ if pos > self.half_window_size+1:
+ us_offset = self.half_window_size
+ else:
+ us_offset = pos - 1
+
+ if pos+self.half_window_size < seqInfo.chromo_sizes[chromo]:
+ ds_offset = self.half_window_size
+ else:
+ ds_offset = seqInfo.chromo_sizes[chromo]-pos-1
+
+ genomicSeq_start = pos - us_offset
+ genomicSeq_stop = pos + ds_offset
+
+ dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
+
+ #if strand == '-':
+ # dna_part = dna_seq[us_offset-len(read_seq):us_offset]
+ #else:
+ # dna_part = dna_seq[us_offset:us_offset+len(read_seq)]
+
+ ## check whether everything is ok
+ # try:
+ # currentDNASeq, currentAcc, currentDon = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop)
+ # except:
+ # problem_ctr += 1
+ # continue
+
+ # In order to save some space we use a signed char to store the
+ # qualities. Each quality element can range as follows: -128 <= elem <= 127
+
+ prb = array.array('b',map(lambda x: ord(x)-self.prb_offset,slist[5]))
+
+ currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
+ currentQualities = [prb]
+
+ # As one single read can have several vmatch matches we store all these
+ # matches under the unique id of the read
+ dataset.setdefault(id, []).append((currentSeqInfo,read_seq,currentQualities))
+
+ instance_counter += 1
+
+ print 'Total examples processed: %d' % instance_counter
+
+ ddir = global_settings['dataset_dir']
+ dataset_fn = jp(ddir,'data.pickle')
+ dataset_keys_fn = jp(ddir,'data.keys.pickle')
+
+ assert not os.path.exists(dataset_fn), 'The data_file already exists!'
+ assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
+
+ # saving new dataset and single keys as well
+ cPickle.dump(dataset,open(dataset_fn,'w+'),protocol=2)
+ cPickle.dump(dataset.keys(),open(dataset_keys_fn,'w+'),protocol=2)
data_fname = jp(result_dir,'map.part_%d'%idx)
result_fname = jp(result_dir,'map.vm.part_%d.heuristic'%idx)
- current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname])
+ current_job = KybJob(gridtools.ApproximationTaskStarter,[run_fname,data_fname,param_fname,result_fname,self.global_settings])
current_job.h_vmem = '25.0G'
#current_job.express = 'True'
self.functionJobs.append(current_job)
-def ApproximationTaskStarter(run_fname,data_fname,param_fname,result_fname):
- ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname)
+def ApproximationTaskStarter(run_fname,data_fname,param_fname,result_fname,settings):
+ ph1 = PipelineHeuristic(run_fname,data_fname,param_fname,result_fname,settings)
ph1.filter()
return 'finished filtering set %s.' % data_fname
vmatch is spliced an should be then newly aligned using QPalma or not.
"""
- def __init__(self,run_fname,data_fname,param_fname,result_fname):
+ def __init__(self,run_fname,data_fname,param_fname,result_fname,settings):
"""
We need a run object holding information about the nr. of support points
etc.
print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
- #g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
- #acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
- #don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+ self.chromo_range = settings['allowed_fragments']
- #g_fmt = 'contig%d.dna.flat'
- #s_fmt = 'contig_%d%s'
-
- #self.chromo_range = range(0,1099)
-
- g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
- don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
-
- g_fmt = 'chr%d.dna.flat'
- s_fmt = 'contig_%d%s'
-
- self.chromo_range = range(1,6)
- #self.chromo_range = range(1,2)
-
- num_chromo = 2
- accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
- self.seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+ accessWrapper = DataAccessWrapper(settings)
+ seqInfo = SeqSpliceInfo(accessWrapper,settings['allowed_fragments'])
start = cpu()
self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)