####################################################################################
#                                                                                  #
#                       SIMPLE USE OF POSITIONAL PRIORS                            #
#                   --------------------------------------                         #
#  This protocol demonstrates how some simple numeric tracks, where the value in   #
#  each position correlates with the probability of observing a binding site in    #
#  that position, can be used as a "positional priors" track that can guide motif  #
#  discovery programs towards the target motifs. Tracks such as Conservation,      #
#  DNase hypersensitivity tracks and ChIP-seq tracks can be used either directly   #
#  or with minimal processing (depending on the motif discovery method used).      #
#  The protocol also demonstrates how operations can be used to manually create    #
#  a positional priors track with a specific search focus, namely searching for    #
#  additional motifs in neighborhood regions around known binding sites.           #
#                                                                                  #
####################################################################################

# The dataset is based on 61 sequences that should contain binding sites for the transcription factor NRSF (REST)
# according to ChIP-seq experiments performed in K562 cells. Each sequence in the dataset is 2000bp long.
AllSequences = new Sequence Collection(File:"http://tare.medisin.ntnu.no/motiflab/exampledata/NRSF_K562_61sequences.txt",format=Location)

DNA = new DNA Sequence Dataset(DataTrack:DNA)

# First we load a few NRSF motifs from TRANSFAC PRO so we can scan for matches and use the predicted sites as a comparison
NRSF = new Motif Collection(File:"http://tare.medisin.ntnu.no/motiflab/exampledata/NRSF.mlx", format=MotifLabMotif)

# We scan with a sensitive threshold but only keep the highest scoring site in each sequence.
# The matches found are probably correct since we know that the sequences should contain NRSF sites
# and the NRSF motifs have very high IC so they are unlikely to match the sequences purely by chance.
NRSF_sites = motifScanning on DNA with SimpleScanner {Motif Collection=NRSF,Threshold type="Percentage",Threshold=75,Score="Absolute"}
NRSF_sites_maximum_score = statistic "maximum score" in NRSF_sites
filter NRSF_sites where region's score < NRSF_sites_maximum_score

# Here we load a track containing ChIP-Seq peak regions for NRSF.
# We convert the track from a region dataset to a numeric track so we can use it as a positional priors track later.
NRSF_ChIPSeq_peak = new Region Dataset(DataTrack:NRSF_K562_ChipSeq_Peak_rep1)
convert NRSF_ChIPSeq_peak to numeric with value=1.0

# We also load a track containing the raw ChIP-seq signal to compare the performance of this track with the peak regions track.
NRSF_ChIPSeq_rawSignal = new Numeric Dataset(DataTrack:NRSF_K562_ChipSeq_rawSignal_rep1)

# Depending on which motif discovery method we use, we might have to perform some processing on the priors tracks.
# For example, in tracks such as "Conservation" the value in each position could be interpreted as a probability that
# the position is INSIDE a binding site (assuming a direct relationship between conservation and TFBS).
# However, most motif discovery methods interpret the value of a position in a priors track as a prior probability
# that a binding site would START in that position. To shift the emphasis towards the start of sites, we can use
# the "apply" operation with a "Sum" window to replace the value in each position with the sum of values
# in multiple following positions (for narrow peaks in a track, this will result in higher values at the start of the
# peak and lower values towards the end).
NRSF_ChIPSeq_processedSignal = apply Sum window of size 10 with anchor at start to NRSF_ChIPSeq_rawSignal

# Some methods (like MEME) complain if any of the positions have a value of 0 (or negative values)
# so we use the "normalize" operation to rescale the values. The following command will keep the maximum
# value in each sequence as it is, but the values between 0 and that value will be scaled so that the
# previous 0 value will be lifted to 0.001.
normalize NRSF_ChIPSeq_processedSignal from range [0,sequence.max] to range [0.001,sequence.max]

# Some methods (again MEME) also require that the values within each sequence represent a probability distribution,
# which means that the values sum to 1.0 (or less). We can use the "normalize sequence sum to one" operation for this.
normalize NRSF_ChIPSeq_processedSignal sequence sum to one

# We also load a conservation track to see if that will make a better positional prior than the ChIP-seq tracks
Conservation = new Numeric Dataset(DataTrack:Conservation)

# Now we perform motif discovery in the dataset using ChIPMunk guided by different positional priors tracks
# The first run (baseline) does not rely on a positional prior
[ChIPMunk_baseline,Motifs_ChIPMunk_baseline] = motifDiscovery on DNA with ChIPMunk {Model="Simple",Min motif size=7,Max motif size=25,Occurrences="OOPS",Motif shape="flat"} motif-prefix="ChIPMunk"
[ChIPMunk_peaks,Motifs_ChIPMunk_peaks] = motifDiscovery on DNA with ChIPMunk {Model="Peak",Min motif size=7,Max motif size=25,Occurrences="OOPS",Peaks=NRSF_ChIPSeq_peak,Motif shape="flat"} motif-prefix="ChIPMunkPeak"
[ChIPMunk_raw,Motifs_ChIPMunk_raw] = motifDiscovery on DNA with ChIPMunk {Model="Peak",Min motif size=7,Max motif size=25,Occurrences="OOPS",Peaks=NRSF_ChIPSeq_rawSignal,Motif shape="flat"} motif-prefix="ChIPMunkRaw"
[ChIPMunk_processed,Motifs_ChIPMunk_processed] = motifDiscovery on DNA with ChIPMunk {Model="Peak",Min motif size=7,Max motif size=25,Occurrences="OOPS",Peaks=NRSF_ChIPSeq_processedSignal,Motif shape="flat"} motif-prefix="ChIPMunkProcessed"
[ChIPMunk_Conservation,Motifs_ChIPMunk_Conservation] = motifDiscovery on DNA with ChIPMunk {Model="Peak",Min motif size=7,Max motif size=25,Occurrences="OOPS",Peaks=Conservation,Motif shape="flat"} motif-prefix="ChIPMunkCons"

# Finally, we compare the performance of the motif discovery runs
Analysis1_benchmark = analyze benchmark {Answer=NRSF_sites,Aggregate=false,Site overlap=0.5}
Output1 = output Analysis1_benchmark in HTML format {Metrics="Sn,sSN,PPV,sPPV,CC",X-axis="Method",Use abbreviations=true,Graph scale=100,Color boxes=false}


# In the following example we will use multiple operations to manually create a positional priors track with a specific focus.
# This track will focus the search for motifs to regions in the neighborhood of the NRSF sites we discovered previously

# First we define a "neighborhood" region around the NRSF sites (in the 'ChIPMunk_processed' track)
# by extending these binding sites by 60bp in both directions
Extended_regions = extend ChIPMunk_processed by 60

# We convert this "neighborhood" track to a numeric track
convert Extended_regions to numeric with value=1.0

# To include information about conservation in our new positional priors, we can e.g. increase the value
# of the priors by the conservation score within the neighborhood regions
increase Extended_regions by Conservation where Extended_regions > 0

# Since we do not want to rediscover the NRSF sites, we set the positional priors track to 0 inside these NRSF regions
# (to be on the safe side we also set the value to 0 in 8bp flanking regions around the sites).
# Note that tracks with names starting with underscore (like "_Extended8" below) are considered as "temporary" data objects
# and will only exist for the duration of the protocol. They will not be shown in the GUI and they will
# be deleted automatically by MotifLab after the protocol execution is complete
_Extended8 = extend ChIPMunk_processed by 8
set Extended_regions to 0 where inside _Extended8

[BindingSites_ChIPMunk1,Motifs_ChIPMunk1] = motifDiscovery on DNA with ChIPMunk {Model="Peak",Min motif size=7,Max motif size=19,Occurrences="0.5",Peaks=Extended_regions,Motif shape="flat"} motif-prefix="ChIPMunk"