--- /dev/null
+// use macro for GNU getline
+#define _GNU_SOURCE
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+
/*
 * This program compares two lists of intron positions, stemming for example
 * from a GFF file, in order to evaluate predictions made by QPalma.
 *
 * It is assumed that the first file contains the ground truth and the
 * second file the predicted positions.
 */
+
// Ground-truth intron coordinates (parallel start/stop arrays) and their
// element count; filled by load_introns() from the first input file.
int* gt_intron_starts;
int* gt_intron_stops;
int gt_size;

// Predicted intron coordinates (parallel start/stop arrays) and their
// element count; filled by load_introns() from the second input file.
int* pred_intron_starts;
int* pred_intron_stops;
int pred_size;

// Number of exactly matching introns, computed by compare_introns().
int matching_introns;

// Usage message printed when the argument count is wrong.
static char* info = "Usage: ./<app name> <input 1 file name> <input 2 file name> <result file name>";
+
+
+/*
+ * This function compares the intron boundaries of the two files.
+ *
+ */
+
+void compare_introns() {
+
+ matching_introns = 0;
+
+ int gt_idx = 0;
+ int pred_idx = 0;
+ int gt_start,gt_stop,pred_start,pred_stop;
+
+ printf("sizes are %d/%d\n",gt_size,pred_size);
+
+ while(1) {
+ if (gt_idx == gt_size || pred_idx == pred_size)
+ break;
+
+ gt_start = gt_intron_starts[gt_idx];
+ gt_stop = gt_intron_stops[gt_idx];
+
+ pred_start = pred_intron_starts[pred_idx];
+ pred_stop = pred_intron_stops[pred_idx];
+
+ //printf("pos: %d/%d %d/%d\n",gt_start,gt_stop,pred_start,pred_stop);
+ if (gt_start == pred_start && gt_stop == pred_stop) {
+ matching_introns++;
+ gt_idx++;
+ pred_idx++;
+ continue;
+ }
+
+ // check for all possible overlappings / positions
+ if ( gt_stop <= pred_start ) {
+ gt_idx++;
+ continue;
+ }
+
+ if ( pred_stop <= gt_start ) {
+ pred_idx++;
+ continue;
+ }
+
+ if (gt_stop <= pred_stop && gt_stop >= pred_start) {
+ gt_idx++;
+ continue;
+ }
+
+ if (pred_stop <= gt_stop && pred_stop >= gt_start) {
+ pred_idx++;
+ continue;
+ }
+ }
+}
+
+
+/*
+ *
+ *
+ *
+ */
+
/*
 * Reads intron positions from the file `fn`.
 *
 * Each line is expected to hold two whitespace-separated integers: an
 * intron start and an intron stop position.  On return, *starts and
 * *stops point to freshly malloc'd parallel arrays (owned by the caller)
 * and *size holds the number of successfully parsed lines.
 *
 * Terminates the program on open or allocation failure.
 */
void load_introns(char* fn, int** starts, int** stops, int* size) {

   FILE *fs = fopen(fn,"r");
   if(fs == NULL) {
      printf("Error: Could not open file: %s",fn);
      exit(EXIT_FAILURE);
   }

   // First pass: count lines so the arrays can be sized exactly.
   char line[1024];
   int line_ctr = 0;
   while( fgets(line,sizeof line,fs) != NULL )
      line_ctr++;

   // Allocate one slot per line for the start/stop positions.
   *starts = malloc(sizeof **starts * line_ctr);
   *stops  = malloc(sizeof **stops  * line_ctr);

   if ( *starts == NULL || *stops == NULL ) {
      perror("Could not allocate memory for position arrays");
      exit(EXIT_FAILURE);   // old code kept going and wrote through NULL
   }

   // Second pass: parse the two positions on every line.  sscanf replaces
   // the previous strncpy-based parser, which copied into buffers without
   // NUL-terminating them (undefined behavior in atoi).
   rewind(fs);

   int ctr = 0;
   while( ctr < line_ctr && fgets(line,sizeof line,fs) != NULL ) {
      int intron_start;
      int intron_stop;
      if ( sscanf(line,"%d %d",&intron_start,&intron_stop) != 2 )
         continue;   // skip malformed lines instead of storing garbage

      (*starts)[ctr] = intron_start;
      (*stops)[ctr]  = intron_stop;
      ctr++;
   }

   // Report only the entries actually parsed (== line count for a
   // well-formed file), so callers never read uninitialized slots.
   *size = ctr;

   printf("ctr is %d\n",ctr);
   if (fclose(fs) != 0)
      perror("Closing of filestream failed!");
}
+
+int main(int argc, char* argv[]) {
+
+ if(argc != 4) {
+ printf("%s\n",info);
+ exit(EXIT_FAILURE);
+ }
+
+ int filenameSize = 256;
+ char* gt_fn = malloc(sizeof(char)*filenameSize);
+ char* pred_fn = malloc(sizeof(char)*filenameSize);
+ char* result_fn= malloc(sizeof(char)*filenameSize);
+
+ if ( gt_fn == NULL || pred_fn == NULL || result_fn == NULL)
+ perror("Could not allocate memory for filenames");
+
+ strncpy(gt_fn,argv[1],strlen(argv[1]));
+ strncpy(pred_fn,argv[2],strlen(argv[2]));
+ strncpy(result_fn,argv[3],strlen(argv[3]));
+
+ FILE *result_fs = fopen(result_fn,"r");
+ if(result_fs == NULL) {
+ printf("Error: Could not open file: %s",result_fn);
+ exit(EXIT_FAILURE);
+ }
+
+
+ load_introns(gt_fn,>_intron_starts,>_intron_stops,>_size);
+ load_introns(pred_fn,&pred_intron_starts,&pred_intron_stops,&pred_size);
+
+ compare_introns();
+
+ int f_status = fclose(result_fs);
+ if(f_status != 0)
+ printf("closing of gff filestream failed!\n");
+
+ free(gt_fn);
+ free(pred_fn);
+ free(result_fn);
+
+ printf("Found %d matching intron(s).\n",matching_introns);
+ exit(EXIT_SUCCESS);
+}
+
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-import sys
import array
-import random
+import cPickle
import os
-import re
import pdb
-import cPickle
+import random
+import sys
import qpalma
import qpalma.tools
class DatasetGenerator:
+ """
+ This class wraps the dataset generation for the matches of the second vmatch
+ run and the heuristic-filtered matches of the first run.
+
+ An example in our dataset consists of:
+
+ - information on the original read:
+ * id
+ * nucleotide sequence
+ * quality vectors (at the moment: prb and chastity)
+
+ - information on the target dna sequence:
+ * chromosome
+ * strand
+ * fragment start
+ * fragment end positions
+
+   This information is stored in tuples, which are then stored in a dictionary
+   indexed by the read's unique id.
+
+ CAUTION: The dictionary stores a list of the above defined tuples under each
+ 'id'. This is because one read can be found several times on the genome.
+ """
def __init__(self,map_file,map_2nd_file):
assert os.path.exists(map_file), 'Error: Can not find map file'
self.dataset = []
- self.read_size = 38
+ #self.read_size = 38
+ self.read_size = 36
def saveAs(self,dataset_file):
dataset_fn = '%s.pickle'% dataset_file
dataset_keys_fn = '%s.keys.pickle'%dataset_file
+ print dataset_fn
+ print dataset_keys_fn
+
assert not os.path.exists(dataset_fn), 'The data_file already exists!'
assert not os.path.exists(dataset_keys_fn), 'The data_keys file already exists!'
def parse_map_file(self,dataset,map_file,first_map):
- """
- """
-
strand_map = ['-','+']
instance_counter = 0
if instance_counter > 0 and instance_counter % 50000 == 0:
print 'processed %d examples' % instance_counter
+ #if instance_counter == 10:
+ # break
+
slist = line.split()
id = int(slist[0])
# Take into account that the results of the second run do not contain
# the original reads anymore.
if first_map:
- original_est = read_seq
+ original_read = read_seq
else:
- original_est = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
+ original_read = create_bracket_seq(dna_seq[ds_offset:ds_offset+self.read_size],read_seq)
# in order to save some space we use a signed char to store the
# qualities. Each quality element can range as follows: -128 <= elem <= 127
# as one single read can have several vmatch matches we store all
# these matches under the unique id of the read
- dataset.setdefault(id, []).append((currentSeqInfo,original_est,currentQualities))
+ dataset.setdefault(id, []).append((currentSeqInfo,original_read,currentQualities))
instance_counter += 1
-
print 'Total examples processed: %d' % instance_counter
return dataset
# usually we have two files to parse:
# the map file from the second run and a subset of the map file from the
# first run
- dataset = self.parse_map_file(dataset,self.map_file,True)
+ #dataset = self.parse_map_file(dataset,self.map_file,True)
dataset = self.parse_map_file(dataset,self.map_2nd_file,False)
self.dataset = dataset