-Steps
------
+----------------
+Processing Steps
+----------------
+
+
+Data Generation
+---------------
The data was taken from
directories 1 to 3.
-We combined the map.vm and map_2nd.vm for all these three dirs.
+We combined the map.vm and map_2nd.vm files of all three directories using
+'createFullMap.sh'.
+
+The combined data was saved in the directory
+/fml/ag-raetsch/home/fabio/tmp/transcriptome_data.
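+
+A minimal sketch of what this combination amounts to, assuming
+'createFullMap.sh' simply concatenates the per-directory vmatch outputs (the
+directory layout used below is an assumption)::
+
+   import os
+
+   run_dirs = ['1', '2', '3']   # the three result directories
+   out_dir  = '/fml/ag-raetsch/home/fabio/tmp/transcriptome_data'
+
+   for name in ['map.vm', 'map_2nd.vm']:
+      combined = open(os.path.join(out_dir, name), 'w')
+      for d in run_dirs:
+         # append this directory's matches to the combined map file
+         part = open(os.path.join(d, name))
+         combined.write(part.read())
+         part.close()
+      combined.close()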
+
+
+QPalma Heuristic
+----------------
+
+Next we split up map.vm in order to run the QPalma heuristic in parallel. The
+heuristic estimates which entries of map.vm should be aligned by QPalma.
+
+Afterwards we combine the per-part results using 'combine_spliced_map_parts.sh'.
+
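+A sketch of the splitting step, assuming the parallel run only requires the
+file to be cut into equally sized line-based chunks (the number of chunks and
+the part file names are placeholders)::
+
+   num_chunks = 10      # number of parallel heuristic jobs (placeholder)
+   lines = open('map.vm').readlines()
+   chunk_size = (len(lines) + num_chunks - 1) // num_chunks
+
+   for idx in range(num_chunks):
+      part = open('map.vm.part_%d' % idx, 'w')
+      part.writelines(lines[idx*chunk_size:(idx+1)*chunk_size])
+      part.close()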
+
+QPalma Dataset Generation
+-------------------------
+
+Once we have the map_2nd.vm and map.vm.spliced files, we create a QPalma
+dataset using 'createNewDataset.py'.
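+
+Roughly, the generation step looks as follows (the module import and the name
+of the driver method that calls parse_map_file for both files are assumptions;
+only the constructor and saveAs appear in the code below)::
+
+   from createNewDataset import DatasetGenerator   # hypothetical module/import
+
+   dg = DatasetGenerator('map.vm.spliced', 'map_2nd.vm')
+   dg.compile_dataset()      # hypothetical driver calling parse_map_file twice
+   dg.saveAs('dataset_transcriptome_run_1')   # writes a .pickle file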
+
+
+
+
+
+Coverage Number Estimation
+--------------------------
+
+For this purpose we need the raw FASTA files of the original reads: in the
+directory 'raw_reads' there should be a file 'reads_0.fa' containing all
+original reads in FASTA format.
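+
+A minimal sketch of the counting part, assuming the coverage number is derived
+from the number of reads and sequenced nucleotides in 'reads_0.fa' (dividing
+the base count by the target sequence length is left out here)::
+
+   num_reads = 0
+   num_bases = 0
+
+   for line in open('raw_reads/reads_0.fa'):
+      if line.startswith('>'):      # fasta header line -> one read
+         num_reads += 1
+      else:
+         num_bases += len(line.strip())
+
+   print('%d reads, %d sequenced nucleotides' % (num_reads, num_bases))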
+
+
+
+
+New Vmatch Criteria
+-------------------
+
+Old settings (1 mismatch):
+
+/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced
+
+
-fabio@congo:/fml/ag-raetsch/share/projects/qpalma/solexa/scripte/first_transcriptome_run$
+New settings, 2 mismatches (stepwise trimming by -3 from 35 down to 20):
+
+/media/oka_raid/backup/data/solexa_analysis/ATH/Transcriptome/Col-0/run_44/4/length_38/spliced_3
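+
+The stepwise trimming presumably means the reads are re-matched at
+progressively shorter lengths; the attempted lengths are simply (a small
+illustration, the vmatch calls themselves are not shown here)::
+
+   print(list(range(35, 19, -3)))   # [35, 32, 29, 26, 23, 20]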
# -*- coding: utf-8 -*-
import sys
+import array
import random
import os
import re
import pdb
import cPickle
-#import numpy
-#from numpy.matlib import mat,zeros,ones,inf
-
import qpalma
import qpalma.tools
from qpalma.parsers import *
from qpalma.sequence_utils import *
-#from Genefinding import *
-#from genome_utils import load_genomic
-
import qpalma.Configuration as Conf
-def create_bracket_seq(dna_seq,read_seq):
- """
- This function takes a dna sequence and a read sequence and returns the
- bracket format of the match/mismatches i.e.
-
- dna : aaa
- read: aac
- is written in bracket notation: aa[ac]
- """
- assert len(dna_seq) == len(read_seq)
- return "".join(map(lambda x,y: ['[%s%s]'%(x,y),x][x==y],dna_seq,read_seq))
-
+#
+# This script takes the map.vm and map_2nd.vm files as input and generates
+# QPalma dataset pickle files.
+#
class DatasetGenerator:
- """
- """
-
def __init__(self,map_file,map_2nd_file):
assert os.path.exists(map_file), 'Error: Can not find map file'
assert os.path.exists(map_2nd_file), 'Error: Can not find map_2nd file'
self.dataset = []
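+      # all reads in this run are 38 nucleotides long (cf. the length_38 run directories)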
+ self.read_size = 38
+
def saveAs(self,dataset_file):
dataset_fn = '%s.pickle'% dataset_file
def parse_map_file(self,dataset,map_file,first_map):
+      """
+      Parse one vmatch map file and add an entry for every match to the given
+      dataset dictionary, which is keyed by read id.
+      """
+
strand_map = ['-','+']
- instance_counter = 1
+ instance_counter = 0
for line in open(map_file):
- if instance_counter % 1001 == 0:
+ if instance_counter > 0 and instance_counter % 50000 == 0:
print 'processed %d examples' % instance_counter
- line = line.strip()
- slist = line.split()
- id = int(slist[0])
+ slist = line.split()
+
+ id = int(slist[0])
chromo = int(slist[1])
pos = int(slist[2])
+
+ # convert D/P strand info to +/-
strand = slist[3]
strand = strand_map[strand == 'D']
else:
read_seq = slist[11]
+ # QPalma uses lowercase characters
+ read_seq = read_seq.lower()
+
+ # fetch the three quality vectors
_prb = slist[8]
_cal_prb = slist[9]
_chastity = slist[10]
- if strand != '+':
- continue
-
+ # for the time being we only consider chromosomes 1-5
if not chromo in range(1,6):
continue
- genomicSeq_start = pos - 1500
- genomicSeq_stop = pos + 1500
+ # we use an area of +/- 1500 nucleotides around the seed position
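+         # (near the chromosome start the offset is clamped so that
+         # genomicSeq_start does not fall below position 1)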
+ if pos > 1501:
+ ds_offset = 1500
+ else:
+ ds_offset = pos-1
+
+ us_offset = 1500
+
+ genomicSeq_start = pos - ds_offset
+ genomicSeq_stop = pos + us_offset
# fetch missing information from original reads
flat_files = '/fml/ag-raetsch/share/projects/genomes/A_thaliana_best/genome/'
- original_info = get_seq_and_scores(chromo,strand,pos,pos+35,flat_files)
- dna_seq = original_info[0]
- #filteredRead = self.all_filtered_reads[id]
- #original_est = filteredRead['seq']
- original_est = create_bracket_seq(dna_seq,read_seq)
+ try:
+ dna_seq = get_seq_and_scores(chromo,strand,genomicSeq_start,genomicSeq_stop,flat_files,True)
+ except:
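+            # fetching the genomic sequence failed; drop into the debugger to inspect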
+ pdb.set_trace()
+
+ # Take into account that the results of the second run do not contain
+ # the original reads anymore.
+ if first_map:
+ original_est = read_seq
+ else:
+ original_est = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
- for idx in range(read_size):
- prb[idx] = ord(_prb[idx])-50
- cal_prb[idx] = ord(_cal_prb[idx])-64
- chastity[idx] = ord(_chastity[idx])+10
+ # in order to save some space we use a signed char to store the
+ # qualities. Each quality element can range as follows: -128 <= elem <= 127
+ prb = array.array('b',[0]*self.read_size)
+ #cal_prb = array.array('b',[0]*self.read_size)
+ chastity = array.array('b',[0]*self.read_size)
+
+ # convert the ascii-encoded quality scores
+ for idx in range(self.read_size):
+ prb[idx] = ord(_prb[idx])-50
+ #cal_prb[idx] = ord(_cal_prb[idx])-64
+ chastity[idx] = ord(_chastity[idx])+10
# add instance to set
currentSeqInfo = (id,chromo,strand,genomicSeq_start,genomicSeq_stop)
- currentQualities = (prb,cal_prb,chastity)
+ currentQualities = (prb,chastity)
# as one single read can have several vmatch matches we store all
# these matches under the unique id of the read
- if dataset.has_key(id):
- old_entry = dataset[id]
- old_entry.append((currentSeqInfo,original_est,currentQualities))
- dataset[id] = old_entry
- else:
- dataset[id] = [(currentSeqInfo,original_est,currentQualities)]
+ dataset.setdefault(id, []).append((currentSeqInfo,original_est,currentQualities))
instance_counter += 1
- print 'Total examples processed: %d' % instance_counter
+ print 'Total examples processed: %d' % instance_counter
return dataset
dataset = self.parse_map_file(dataset,self.map_file,True)
dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
- # store the full set
- self.testing_set = dataset
+ self.dataset = dataset