####################################################################################
#                                                                                  #
#                Benchmark Evaluation of Motif Discovery Methods                   #
#             -----------------------------------------------------                #
#                                                                                  #
#  This protocol will generate 20 random DNA sequences with specified length       #
#  based on a chosen background model and then plant up to 5 selected motifs at    #
#  random locations in these sequences. A few de novo motif discovery methods      #
#  will be run to predict the locations of the motifs in the sequences, and the    #
#  performance of the methods will be evaluated. Note that in order to use this    #
#  protocol you must have installed/configured the motif discovery methods used    #
#  here (or rewrite the protocol to use other methods instead).                    #
#                                                                                  #
####################################################################################

# --- Sequence setup ---
# 'length' is the length given to each of the 20 sequences created below.
# It starts out as 500 and the user is prompted to confirm or change it.
length = new Numeric Variable(500)
prompt for length "Please select a length for the 20 sequences that will be generated"
# 'background' starts out as a uniform background model; the user is prompted
# to confirm it or choose another one. It is used to create the DNA dataset
# below and is also passed on to some of the motif discovery methods.
background = new Background Model(Model:Uniform)
prompt for background "Please select a background model for DNA sequence generation"
# Create the 20 sequences (Sequence01 .. Sequence20), all with the same length
Sequence01 = new Sequence(length)
Sequence02 = new Sequence(length)
Sequence03 = new Sequence(length)
Sequence04 = new Sequence(length)
Sequence05 = new Sequence(length)
Sequence06 = new Sequence(length)
Sequence07 = new Sequence(length)
Sequence08 = new Sequence(length)
Sequence09 = new Sequence(length)
Sequence10 = new Sequence(length)
Sequence11 = new Sequence(length)
Sequence12 = new Sequence(length)
Sequence13 = new Sequence(length)
Sequence14 = new Sequence(length)
Sequence15 = new Sequence(length)
Sequence16 = new Sequence(length)
Sequence17 = new Sequence(length)
Sequence18 = new Sequence(length)
Sequence19 = new Sequence(length)
Sequence20 = new Sequence(length)
# Create the DNA track for the sequences, generated from the chosen background model
DNA = new DNA Sequence Dataset(background)

# --- Target motif selection ---
# Load the TRANSFAC Public motif collection.
# (presumably so that its motifs are available to choose from in the prompt below - confirm)
TRANSFAC_Public = new Motif Collection(Collection:TRANSFAC Public)

# 'Motifs' starts out empty and is filled in by the user. It holds the target
# motifs that will be planted in the sequences.
Motifs = new Motif Collection()
prompt for Motifs "Please select between 1 and 5 target motifs to plant in the sequences"

# Derive a few statistics from the selected motifs:
#   Motif_sizes      : map holding the "Size" property of each motif
#   Number_of_motifs : the number of motifs in the 'Motifs' collection
#   max_motif_size   : the top value of Motif_sizes among the motifs in 'Motifs'
#   min_motif_size   : the bottom value of Motif_sizes among the motifs in 'Motifs'
# NOTE(review): min_motif_size is not referenced by any later statement in this
# protocol - confirm whether it is still needed.
Motif_sizes = new Motif Numeric Map(Property:Size)
Number_of_motifs = extract "size" from Motifs as Numeric Variable
max_motif_size = extract "top value in Motifs" from Motif_sizes as Numeric Variable
min_motif_size = extract "bottom value in Motifs" from Motif_sizes as Numeric Variable

# Set up a few data objects that are required as parameters by some of the
# motif discovery methods below:
#   uniform_priors : numeric track created with the value 1 and then normalized
#                    to sum to one; passed to PRIORITY as its "Priors" parameter.
#                    The track is hidden since it is only a helper dataset.
#   GCcontent      : GC-content extracted from the background model;
#                    passed to AlignACE as its "GC-background" parameter.
uniform_priors = new Numeric Dataset(1)
normalize uniform_priors sum to one
!hide(uniform_priors)
GCcontent = extract "GC-content" from background as Numeric Variable


# Plant the selected motifs in the DNA track to create the artificial benchmark dataset.
# The operation returns two results: the DNA track (replacing the previous 'DNA')
# and a new track 'target_sites', which is later used as the answer track in the
# benchmark analysis.
# The operation is run with Plant probability=1.0 and Force plant=true, match
# limits of 0.75 (min) to 1.0 (max) and Reverse probability=0.5.
# (presumably the match limits control how closely a planted site must resemble
# its motif, and the reverse probability is the chance of planting on the
# opposite strand - confirm against the documentation of the plant operation)
[DNA,target_sites] = plant Motifs in DNA {Plant probability=1.0,Force plant=true,Min match=0.75,Max match=1.0,Reverse probability=0.5}

# Perform motif discovery with a few methods.
# Note that you have to have these methods installed and registered with MotifLab
# in order to use them.
#
# Every call below runs on the DNA track containing the planted motifs and
# returns two results: a track with the predicted sites (<method>_predictions)
# and a collection of the predicted motifs (Motifs_predicted_by_<method>).
# The motif-prefix argument is set to the name of the method in each call.
# Where a method takes such parameters, the motif width is set to max_motif_size
# and the number of motifs to report is set to Number_of_motifs.

# MotifSampler
[MotifSampler_predictions,Motifs_predicted_by_MotifSampler] = motifDiscovery on DNA with MotifSampler {Background=background,Width=max_motif_size,Number=Number_of_motifs,Max occurrences=0,Probability=0.5,Overlap=1,Strand=1} motif-prefix="MotifSampler"

# PRIORITY (uses the uniform_priors track as priors)
[PRIORITY_predictions,Motifs_predicted_by_PRIORITY] = motifDiscovery on DNA with PRIORITY {Number of motifs=Number_of_motifs,Motif length=max_motif_size,Priors=uniform_priors,Background=background,Strand="Double",Allow 0 occurrences=true} motif-prefix="PRIORITY"

# AlignACE (uses the GC-content of the background model)
[AlignACE_predictions,Motifs_predicted_by_AlignACE] = motifDiscovery on DNA with AlignACE {Columns=max_motif_size,Expect=Number_of_motifs,GC-background=GCcontent,Min pass=200,Undersample=1.0,Oversample=1.0} motif-prefix="AlignACE"

# BioProspector
[BioProspector_predictions,Motifs_predicted_by_BioProspector] = motifDiscovery on DNA with BioProspector {Motif width=max_motif_size,Top motifs to report=Number_of_motifs,Number of tries=40,Strand="Search both strands",Must occur in all="Yes",Report degenerate sites="Yes"} motif-prefix="BioProspector"

# MDscan
[MDscan_predictions,Motifs_predicted_by_MDscan] = motifDiscovery on DNA with MDscan {Motif width=max_motif_size,Top sequences to search=5,Top motifs to refine=30,Top motifs to report=Number_of_motifs,Iterations=10} motif-prefix="MDscan"

# Weeder
# NOTE(review): unlike the other methods, Weeder is run with a fixed
# "Motif width=8" rather than max_motif_size, and with Organism="Homo sapiens"
# regardless of the selected background model. Presumably Weeder only accepts
# a limited set of widths and organisms - confirm that this is intended.
[Weeder_predictions,Motifs_predicted_by_Weeder] = motifDiscovery on DNA with Weeder {Analysis="medium",Organism="Homo sapiens",Support=100,Search both strands=true,Allow multiple copies=false,Number of motifs=Number_of_motifs,Motif width=8,Mismatches=1,Motif model="From exact pattern"} motif-prefix="Weeder"


# Compare the predictions made by the different methods to the track containing
# the answers (target_sites, which holds the planted motif sites).
# Only the answer track is named explicitly here; the prediction tracks to be
# evaluated are not listed.
# (presumably the analysis evaluates all other applicable tracks - confirm
# against the documentation of the benchmark analysis)
Benchmark_analysis = analyze benchmark {Answer=target_sites,Aggregate=false,Site overlap=0.25}

# Set a few colors and other display options for the barchart in the output.
# Each prediction track is assigned its own color.
$color(MotifSampler_predictions)=RED
$color(PRIORITY_predictions)=YELLOW
$color(AlignACE_predictions)=ORANGE
$color(BioProspector_predictions)=GREEN
$color(MDscan_predictions)=LIGHT BLUE
$color(Weeder_predictions)=VIOLET


# Appearance of the bars: gradient fill, border color, bar width, and the
# spacing between bars and between groups of bars
$setting(barchart.gradientfill)=ON
$setting(barchart.border)=BLACK
$setting(barchart.barwidth)=10
$setting(barchart.bardistance)=14
$setting(barchart.bargroupspacing)=28


# Output the results of the benchmark analysis as an HTML document (Output1).
# "Metrics" lists the performance statistics to include in the output
# (using abbreviated names, since "Use abbreviations" is true).
Output1 = output Benchmark_analysis in HTML format {Metrics="Sn,Sp,PPV,ASP,PC,Acc,CC,sSN,sPPV,sASP",X-axis="Statistic/Groups",Use abbreviations=true,Graph scale=100,Color boxes=false}