##' Pre-processes BED files with tag data ##' Makes coverage files (1 pr chromosome) describing tag counts along the chromosomes. ##' options(stringsAsFactors=FALSE) library(IRanges) library(yaml) ##' Runs preprocessing according to configuration file. ##' ##' If configPath is NULL, the readPath, coverPath and readWidth are taken from the function arguments. ##' @title preprocess ##' @param configPath Path to a configuration file in YAML format (see config.yml), or NULL. ##' @param readPath Path to BED files with reads. ##' @param coverPath Path for coverage files (preprocessing output). ##' @param readWidth Read width (w) ##' @return preprocess <- function(configPath="./config.yml", params=list()){ if (! is.null(configPath)){ message("Using config file ", configPath) config <- yaml.load_file(configPath) READ.PATH <- config$READ.PATH COVER.PATH <- config$COVER.PATH READ.WIDTH <- config$READ.WIDTH } for (i in seq_along(params)){ assign(names(params)[i], params[[i]]) } makeRangedData(READ.PATH) if (! file.exists(COVER.PATH)){ dir.create(COVER.PATH, recursive=TRUE) } makeChromosomeCoverFiles(filePath=READ.PATH, gapped.width=READ.WIDTH, outputPath=COVER.PATH) mergeChromosomeCoverFiles(filePath=COVER.PATH) } ##' Converts BED files with position of mapped reads to IRanges RangedData objects ##' ##' ##' @title makeRangedData ##' @param filePath The file path for the BED files ##' @return makeRangedData <- function(filePath="."){ files <- list.files(path = filePath, pattern = "bed$") for (file in files){ message("Converting BED to RangedData for file ", file.path(filePath, file)) dfr <- read.delim(file.path(filePath, file), header=FALSE, colClasses=c("character", rep("integer",2), rep("NULL",2), "character")) colnames(dfr) <- c("space", "start", "end", "strand") dfr <- with(dfr, dfr[order(space, start, end), ]) rd <- as(dfr, "RangedData") save(rd, file=file.path(filePath, sub("bed$", "RData", file))) } } ##' Makes files describing tag coverage for each chromosome ##' ##' Each file is split on chromosomes and the coverage pr strand is calculated in intervals ##' @title makeChromosomeCoverFiles ##' @param filePath The file path for the files ##' @param filePattern The name pattern for RangedData files, ending with RData ##' @param gapped.width The width of each coverage interval ##' @return makeChromosomeCoverFiles <- function(filePath=".", filePattern="RData$", gapped.width=100, outputPath="./chrcovers"){ rdfiles <- list.files(path=filePath, pattern=filePattern) for (file in rdfiles){ message("Making chromosome coverage files for ", file) load(file.path(filePath, file)) for (chr in names(rd)){ if (grepl("[_M]", chr)) next # Ignore chromosome M strand <- values(rd)[[chr]]$strand y <- ranges(rd)[[chr]] ly <- split(y, strand) lsize <- lapply(ly, length) lstart <- lapply(ly, start) lwidth <- lapply(ly, width) lgap <- lapply(lwidth, function(w) floor((gapped.width-w)/2)) irpos <- IRanges(start=lstart[["+"]] - lgap[["+"]], width=lwidth[["+"]] + 2*lgap[["+"]]) irneg <- IRanges(start=lstart[["-"]] - lgap[["-"]], width=lwidth[["-"]] + 2*lgap[["-"]]) irl <- IRangesList("-"=irneg, "+"=irpos) lcvg <- coverage(irl) covers <- list(SIZE=lsize, CVG=lcvg) save(covers, file=file.path(outputPath, paste(chr, "_", file, sep=""))) } } } ##' Merges/pools chromosome coverage files pr chromosome ##' ##' Makes one file pr chromosome, containing a named list of all files merged. ##' Each list item has the coverage for each strand, according to that file. ##' @title mergeChromosomeCoverFiles ##' @param filePath Path to coverage files ##' @param filePattern File name pattern of coverage files ##' @return mergeChromosomeCoverFiles <- function(filePath="./chrcovers", filePattern="RData$"){ allfiles <- list.files(filePath, pattern=filePattern, full.names=TRUE) chrfiles <- allfiles[grepl(".+/chr([^_]+)_", allfiles)] chrs <- unique(sub(".+/([^_]+)_.+", "\\1", chrfiles)) for (chr in chrs){ message("Merging coverage files for chromosome ", chr) chrcovers <- list() files <- chrfiles[grep(paste(chr, "_", sep=""), chrfiles)] for (file in files){ cat(file, '\n') load(file) cat("Loaded", file, '\n') target <- sub("[^_]+_(.+).RData", "\\1", file) chrcovers[[target]] <- covers } save(chrcovers, file=file.path(filePath, paste(chr, ".RData", sep=""))) } }