import qpalma.tools
from qpalma.parsers import *
-from qpalma.sequence_utils import *
+from qpalma.sequence_utils import SeqSpliceInfo,DataAccessWrapper,get_flat_file_size,reverse_complement,unbracket_seq,create_bracket_seq
import QPalmaConfiguration as Conf
strand_map = ['-','+']
instance_counter = 0
+ g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+ don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+ g_fmt = 'chr%d.dna.flat'
+ s_fmt = 'contig_%d%s'
+
+ num_chromo = 6
+
+ accessWrapper = DataAccessWrapper(g_dir,acc_dir,don_dir,g_fmt,s_fmt)
+ seqInfo = SeqSpliceInfo(accessWrapper,range(1,num_chromo))
+
for line in open(map_file):
- if instance_counter > 0 and instance_counter % 50000 == 0:
+ if instance_counter > 0 and instance_counter % 5000 == 0:
print 'processed %d examples' % instance_counter
-
- #if instance_counter == 10:
- # break
+ #if instance_counter == 10000:
+ # break
slist = line.split()
strand = slist[3]
strand = strand_map[strand == 'D']
- if first_map:
- read_seq = slist[7]
- else:
- read_seq = slist[11]
+ read_seq = slist[4]
# QPalma uses lowercase characters
read_seq = read_seq.lower()
+ read_seq = unbracket_seq(read_seq)
- # fetch the three quality vectors
- _prb = slist[8]
- _cal_prb = slist[9]
- _chastity = slist[10]
+ # fetch the prb quality vector
+ _prb = slist[5]
# for the time being we only consider chromosomes 1-5
if not chromo in range(1,6):
continue
+ if strand == '-':
+ pos = seqInfo.chromo_sizes[chromo]-pos-self.read_size
+ read_seq = reverse_complement(read_seq)
+
# we use an area of +/- 1500 nucleotides around the seed position
if pos > 1501:
ds_offset = 1500
genomicSeq_start = pos - ds_offset
genomicSeq_stop = pos + us_offset
- # fetch missing information from original reads
- flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
- try:
- dna_seq = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,flat_files,True)
- except:
- pdb.set_trace()
+ #try:
+ dna_seq = seqInfo.get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,True)
+ #except:
+ # pdb.set_trace()
# Take into account that the results of the second run do not contain
# the original reads anymore.
# qualities. Each quality element can range as follows: -128 <= elem <= 127
prb = array.array('b',[0]*self.read_size)
#cal_prb = array.array('b',[0]*self.read_size)
- chastity = array.array('b',[0]*self.read_size)
+ #chastity = array.array('b',[0]*self.read_size)
# convert the ascii-encoded quality scores
for idx in range(self.read_size):
prb[idx] = ord(_prb[idx])-50
#cal_prb[idx] = ord(_cal_prb[idx])-64
- chastity[idx] = ord(_chastity[idx])+10
+ #chastity[idx] = ord(_chastity[idx])+10
# add instance to set
currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
- currentQualities = (prb,chastity)
+ currentQualities = (prb)
# as one single read can have several vmatch matches we store all
# these matches under the unique id of the read
# -*- coding: utf-8 -*-
import os.path
+import cProfile
from compile_dataset import DatasetGenerator
-jp = os.path.join
-#working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
-#working_dir='/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
-#working_dir='/fml/ag-raetsch/share/projects/qpalma/solexa/new_run2/mapping/spliced'
+def run():
+ jp = os.path.join
-main_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_0/'
+ main_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
-#spliced_dir = '/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced'
-#result_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_1/'
+ spliced_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1'
+ result_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/spliced_1/dataset'
-spliced_dir = '/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced_3'
-result_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_2/'
+ map_1_fn = jp(main_dir,'map.vm.spliced')
+ map_2_fn = jp(spliced_dir,'map.vm')
-map_1_fn = jp(main_dir,'map.vm.spliced')
-map_2_fn = jp(spliced_dir,'map.vm')
+ dg = DatasetGenerator(map_1_fn,map_2_fn)
+ dg.compile_dataset()
+ dg.saveAs(jp(result_dir,'dataset_run_1.pickle'))
-dg = DatasetGenerator(map_1_fn,map_2_fn)
-dg.compile_dataset()
-dg.saveAs(jp(result_dir,'dataset_run_2.pickle'))
+if __name__ == '__main__':
+ #cProfile.run('run()')
+ run()