import pdb
import os
import os.path
-import math
import resource
from qpalma.computeSpliceWeights import *
from ParaParser import *
-from Lookup import Lookup
+from qpalma.Lookup import LookupTable
from qpalma.sequence_utils import reverse_complement,unbracket_seq
-
class BracketWrapper:
- fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
- 'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+ #fields = ['id', 'chr', 'pos', 'strand', 'mismatches', 'length',\
+ #'offset', 'seq', 'prb', 'cal_prb', 'chastity']
+ fields = ['id', 'chr', 'pos', 'strand', 'seq', 'prb']
def __init__(self,filename):
- self.parser = ParaParser("%lu%d%d%s%d%d%d%s%s%s%s",self.fields,len(self.fields),IN_VECTOR)
+ self.parser = ParaParser("%lu%d%d%s%s%s",self.fields,len(self.fields),IN_VECTOR)
self.parser.parseFile(filename)
def __len__(self):
run = cPickle.load(open(run_fname))
self.run = run
- dna_flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
-
-
start = cpu()
-
# old version
#self.all_remapped_reads = parse_map_vm_heuristic(data_fname)
self.all_remapped_reads = BracketWrapper(data_fname)
-
stop = cpu()
print 'parsed %d reads in %f sec' % (len(self.all_remapped_reads),stop-start)
+ #g_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/contigs'
+ #acc_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/acc'
+ #don_dir = '/fml/ag-raetsch/home/fabio/tmp/Lyrata/splice_scores/don'
+
+ #g_fmt = 'contig%d.dna.flat'
+ #s_fmt = 'contig_%d%s'
+
+ #self.chromo_range = range(0,1099)
+
+ g_dir = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
+ acc_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/acc'
+ don_dir = '/fml/ag-raetsch/home/fabio/tmp/interval_query_files/don'
+
+ g_fmt = 'chr%d.dna.flat'
+ s_fmt = 'contig_%d%s'
+
+ self.chromo_range = range(1,6)
start = cpu()
- self.lt1 = Lookup(dna_flat_files)
+ self.lt1 = LookupTable(g_dir,acc_dir,don_dir,g_fmt,s_fmt,self.chromo_range)
stop = cpu()
+
print 'prefetched sequence and splice data in %f sec' % (stop-start)
self.result_spliced_fh = open('%s.spliced'%result_fname,'w+')
self.mmatrix = mmatrix
self.qualityPlifs = qualityPlifs
- self.read_size = 36
+ self.read_size = 38
# parameters of the heuristics to decide whether the read is spliced
#self.splice_thresh = 0.005
chr = location['chr']
pos = location['pos']
strand = location['strand']
- mismatch = location['mismatches']
- length = location['length']
- off = location['offset']
+ #mismatch = location['mismatches']
+ #length = location['length']
+ #off = location['offset']
seq = location['seq']
prb = location['prb']
- cal_prb = location['cal_prb']
- chastity = location['chastity']
+ #cal_prb = location['cal_prb']
+ #chastity = location['chastity']
id = int(id)
strand = strand_map[strand]
- if not chr in range(1,6):
+ if not chr in self.chromo_range:
continue
unb_seq = unbracket_seq(seq)
# forgot to do this
if strand == '-':
- pos = self.lt1.seqInfo.chromo_sizes[chr+7]-pos-self.read_size
+ pos = self.lt1.seqInfo.chromo_sizes[chr]-pos-self.read_size
unb_seq = reverse_complement(unb_seq)
effective_len = len(unb_seq)
exons[1,0] = effective_len
est = unb_seq
original_est = seq
- quality = prb
+ quality = map(lambda x:ord(x)-50,prb)
#pdb.set_trace()
strand = location['strand']
original_est = location['seq']
quality = location['prb']
- cal_prb = location['cal_prb']
+ quality = map(lambda x:ord(x)-50,quality)
+ #cal_prb = location['cal_prb']
original_est = original_est.lower()
est = unbracket_seq(original_est)
##########
-def split_file(filename,result_dir,parts):
+def split_file(filename,result_dir,num_parts):
+ """
+ Splits a file of n lines into num_parts parts
+ """
+
jp = os.path.join
all_intervals = []
print 'found %d lines' % line_ctr
- part = line_ctr / parts
+ part_size = line_ctr / num_parts
begin = 0
end = 0
- for idx in range(1,parts+1):
-
- if idx == parts:
- begin = end
- end = line_ctr
+
+ for idx in range(1,num_parts+1):
+ begin = end
+
+ if idx == num_parts:
+ end = line_ctr
else:
- begin = end
- end = begin+part
+ end = begin+part_size+1
- params = (begin,end)
- all_intervals.append(params)
+ all_intervals.append((begin,end))
parts_fn = []
for pos,params in enumerate(all_intervals):
lineCtr = 0
beg = -1
end = -1
- for line in open(filename,'r'):
+ in_fh = open(filename,'r')
+ while True:
+ line = in_fh.readline()
+ if line == '':
+ break
+
if beg <= lineCtr < end:
out_fh.write(line)
lineCtr += 1
else:
- params = all_intervals.pop()
- beg,end = params
+ (beg,end) = all_intervals.pop()
out_fn = parts_fn.pop()
out_fh.close()
out_fh = open(out_fn,'w+')
+ out_fh.write(line)
out_fh.close()
if __name__ == '__main__':
- split_file_join_results('/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/map.vm',10)
+ split_file('/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm','/tmp',25)
+
+
def create_and_submit():
jp = os.path.join
+ num_splits = 25
+
run_dir = '/fml/ag-raetsch/home/fabio/tmp/newest_run/alignment/run_enable_quality_scores_+_enable_splice_signals_+_enable_intron_length_+'
- data_dir = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/run_0'
+ #data_dir = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/'
+
+ data_dir = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main'
+
run_fname = jp(run_dir,'run_obj.pickle')
- original_map_fname = '/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/main/map.vm'
- #split_file(original_map_fname,data_dir,25)
+
+ #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/lyrata_analysis/map.vm'
+ #original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data/new_map.vm'
+ original_map_fname = '/fml/ag-raetsch/home/fabio/tmp/vmatch_evaluation/main/map.vm'
+ split_file(original_map_fname,data_dir,num_splits)
param_fname = jp(run_dir,'param_526.pickle')
functionJobs=[]
- for idx in range(3,25):
+ for idx in range(0,num_splits):
data_fname = jp(data_dir,'map.part_%d'%idx)
result_fname = jp(data_dir,'map.vm.part_%d.heuristic'%idx)
current_job = KybJob(grid_heuristic.g_heuristic,[run_fname,data_fname,param_fname,result_fname])
current_job.h_vmem = '25.0G'
- current_job.express = 'True'
+ #current_job.express = 'True'
print "job #1: ", current_job.nativeSpecification
functionJobs.append(current_job)
+ #break
(sid, jobids) = submit_jobs(functionJobs)
#print 'checking whether finished'