#################################################################################
#                                                                               #
#  This protocol will perform motif scanning and output a table showing         #
#  the number of times each motif is found in the sequences.                    #
#  An overrepresentation p-value is calculated for each motif by comparing      #
#  the motif's observed frequency in the sequences to its expected frequency    #
#  based on how often it is found in a set of artificial DNA sequences          #
#  having the same oligonucleotide composition as the original sequences.       #
#                                                                               #
#################################################################################

# Uncomment the next line (remove #-sign before 'AllSequences') to use a precompiled collection of example sequences
#AllSequences = new Sequence Collection(File:"http://tare.medisin.ntnu.no/motiflab/exampledata/MuscleGenes_-2000+200.txt",format=Location)

# Create the DNA Sequence Dataset that will be scanned, based on the data track named "DNA"
DNA = new DNA Sequence Dataset(DataTrack:DNA)

# Motif collection to scan with. "Jaspar Core" is the active choice; to use "TRANSFAC Public"
# instead, comment out the first line below and uncomment the second.
Motifs = new Motif Collection(Collection:Jaspar Core)
#Motifs = new Motif Collection(Collection:TRANSFAC Public)

# Perform motif scanning in the DNA sequence with a cutoff threshold selected by the user (default 95%)

# 'cutoff' is initialized to 95.0 and the user is then prompted to change it. It is passed to
# SimpleScanner as a "Percentage"-type threshold. The same variable is reused further down when
# scanning the randomized DNA, so both scans always share one threshold.
cutoff = new Numeric Variable(95.0)
prompt for cutoff "Select motif scanning cutoff threshold [0-100]"
TFBS = motifScanning on DNA with SimpleScanner {Motif Collection=Motifs,Score="Absolute",Threshold type="Percentage",Threshold=cutoff}

# Create a background model based on the oligonucleotide frequency composition in the DNA track
# and then use this model to create a new artificial DNA track with the same frequency composition.
# By performing the same motif scanning step in this random DNA track, we can calculate expected
# occurrence frequencies for each motif. Note that the accuracy of this expected frequency estimation
# will depend on the total length of the DNA sequences used. More/longer sequences will result in more
# accurate estimations. If the total length of the DNA sequences is too short, many motifs will not
# appear at all in the random DNA and these will then be assigned default expected frequencies of 0.

# The Markov model order is initialized to 3 and the user is then prompted to change it.
# The background model is built from the DNA track with Strand=Relative.
BG_model_order = new Numeric Variable(3)
prompt for BG_model_order "Select Markov Model order for background model creation [0-5]"
BG_model = new Background Model(Track:DNA,Order=BG_model_order,Strand=Relative)

# The next line will create a randomized/scrambled version of the original DNA track by using
# the BG_model to randomly select new bases to replace the original base at each position.
# The original DNA track is left untouched; the result is stored in the new track DNA_scrambled.
DNA_scrambled = mask DNA relative strand with BG_model
# Hide the scrambled track (presumably from the sequence display -- verify); it is still used as input below
$hide(DNA_scrambled)
# Perform motif scanning in the artificial sequences, using exactly the same motif collection,
# score type, threshold type and cutoff as the scan of the real DNA so that the counts are comparable
TFBS_scrambled = motifScanning on DNA_scrambled with SimpleScanner {Motif Collection=Motifs,Score="Absolute",Threshold type="Percentage",Threshold=cutoff}
# Hide this intermediate track as well; it only serves as input to the expected-frequency estimate
$hide(TFBS_scrambled)

# Now estimate expected occurrence frequencies for each motif based on the number of times they occur in the artificial sequences.
# The result is a Motif Numeric Map derived from the "Frequency" property of the TFBS_scrambled track.
Expected_Motif_Frequencies = new Motif Numeric Map(Track:TFBS_scrambled,property=Frequency)

# Count the number of times each motif occurs in the sequences and compare the observed motif frequencies
# to the expected frequencies in random DNA to calculate p-values.
# The analysis is run on the TFBS track (the scan of the real DNA) for the motifs in 'Motifs', with
# Expected_Motif_Frequencies supplied as background frequencies. The significance threshold is 0.05
# and the Bonferroni correction setting is "All motifs".

Analysis_Motif_Occurrences = analyze count motif occurrences {Motif track=TFBS,Motifs=Motifs,Background frequencies=Expected_Motif_Frequencies,Significance threshold=0.05,Bonferroni correction="All motifs"}

# Motif sequence logos are not included in the results table, since creating logo images for every motif can be quite resource consuming.
# However, if you want to include sequence logos in the output, change the
# value of the 'Logos' parameter at the end of the next line to
# either "Shared images" or "New images" (instead of "No").
# The table is written in HTML format, sorted by p-value, and includes all motifs.

Motif_Count_Table = output Analysis_Motif_Occurrences in HTML format {Sort by="p-value",Include="All motifs",Logos="No"}