# ==================================================================================
#  Benchmark Evaluation of Motif Discovery Methods
# ==================================================================================
#
#  What this protocol does:
#    1. Creates 20 artificial DNA sequences of a user-selected length, with the
#       DNA track generated from a user-selected background model.
#    2. Plants the motifs selected by the user (the prompt asks for 1 to 5) at
#       random locations in these sequences.
#    3. Runs several de novo motif discovery methods on the sequences.
#    4. Compares the predicted sites of every method against the planted sites
#       and outputs the benchmark statistics as an HTML report.
#
#  Requirements:
#    Every motif discovery method used below must be installed and registered
#    with MotifLab. If a method is not available, remove its step or replace it
#    with a different method.
#
# ==================================================================================


# ---------------------------------------------------------------------------------
#  User input: sequence length and background model
# ---------------------------------------------------------------------------------

length = new Numeric Variable(500)
prompt for length "Please select a length for the 20 sequences that will be generated"

background = new Background Model(Model:Uniform)
prompt for background "Please select a background model for DNA sequence generation"


# ---------------------------------------------------------------------------------
#  Create the 20 artificial sequences (all with the selected length)
# ---------------------------------------------------------------------------------

Sequence01 = new Sequence(length)
Sequence02 = new Sequence(length)
Sequence03 = new Sequence(length)
Sequence04 = new Sequence(length)
Sequence05 = new Sequence(length)
Sequence06 = new Sequence(length)
Sequence07 = new Sequence(length)
Sequence08 = new Sequence(length)
Sequence09 = new Sequence(length)
Sequence10 = new Sequence(length)
Sequence11 = new Sequence(length)
Sequence12 = new Sequence(length)
Sequence13 = new Sequence(length)
Sequence14 = new Sequence(length)
Sequence15 = new Sequence(length)
Sequence16 = new Sequence(length)
Sequence17 = new Sequence(length)
Sequence18 = new Sequence(length)
Sequence19 = new Sequence(length)
Sequence20 = new Sequence(length)

# The DNA track is created from the selected background model
DNA = new DNA Sequence Dataset(background)


# ---------------------------------------------------------------------------------
#  User input: the target motifs to plant
# ---------------------------------------------------------------------------------

TRANSFAC_Public = new Motif Collection(Collection:TRANSFAC Public)
Motifs = new Motif Collection()
prompt for Motifs "Please select between 1 and 5 target motifs to plant in the sequences"

# Derive the number of selected motifs and their smallest/largest size.
# The count and the largest size are passed on as parameters to the methods below.
Motif_sizes = new Motif Numeric Map(Property:Size)
Number_of_motifs = extract "size" from Motifs as Numeric Variable
max_motif_size = extract "top value in Motifs" from Motif_sizes as Numeric Variable
min_motif_size = extract "bottom value in Motifs" from Motif_sizes as Numeric Variable


# ---------------------------------------------------------------------------------
#  Auxiliary data objects needed by some of the methods
# ---------------------------------------------------------------------------------

# Constant-valued numeric track normalized to sum to one (passed to PRIORITY as "Priors").
# The track is hidden from the visualization.
uniform_priors = new Numeric Dataset(1)
normalize uniform_priors sum to one
!hide(uniform_priors)

# GC-content of the background model (passed to AlignACE as "GC-background")
GCcontent = extract "GC-content" from background as Numeric Variable


# ---------------------------------------------------------------------------------
#  Create the benchmark dataset
# ---------------------------------------------------------------------------------

# Plant the selected motifs in the DNA track. The "target_sites" track holds the
# planted sites and serves as the answer in the benchmark analysis further down.
[DNA,target_sites] = plant Motifs in DNA {Plant probability=1.0,Force plant=true,Min match=0.75,Max match=1.0,Reverse probability=0.5}


# ---------------------------------------------------------------------------------
#  Motif discovery
#  (each method below must be installed and registered with MotifLab)
# ---------------------------------------------------------------------------------

# --- MotifSampler ---
[MotifSampler_predictions,Motifs_predicted_by_MotifSampler] = motifDiscovery on DNA with MotifSampler {Background=background,Width=max_motif_size,Number=Number_of_motifs,Max occurrences=0,Probability=0.5,Overlap=1,Strand=1} motif-prefix="MotifSampler"

# --- PRIORITY ---
[PRIORITY_predictions,Motifs_predicted_by_PRIORITY] = motifDiscovery on DNA with PRIORITY {Number of motifs=Number_of_motifs,Motif length=max_motif_size,Priors=uniform_priors,Background=background,Strand="Double",Allow 0 occurrences=true} motif-prefix="PRIORITY"

# --- AlignACE ---
[AlignACE_predictions,Motifs_predicted_by_AlignACE] = motifDiscovery on DNA with AlignACE {Columns=max_motif_size,Expect=Number_of_motifs,GC-background=GCcontent,Min pass=200,Undersample=1.0,Oversample=1.0} motif-prefix="AlignACE"

# --- BioProspector ---
[BioProspector_predictions,Motifs_predicted_by_BioProspector] = motifDiscovery on DNA with BioProspector {Motif width=max_motif_size,Top motifs to report=Number_of_motifs,Number of tries=40,Strand="Search both strands",Must occur in all="Yes",Report degenerate sites="Yes"} motif-prefix="BioProspector"

# --- MDscan ---
[MDscan_predictions,Motifs_predicted_by_MDscan] = motifDiscovery on DNA with MDscan {Motif width=max_motif_size,Top sequences to search=5,Top motifs to refine=30,Top motifs to report=Number_of_motifs,Iterations=10} motif-prefix="MDscan"

# --- Weeder ---
# NOTE: unlike the other methods, Weeder is run with a fixed motif width of 8
# rather than max_motif_size.
[Weeder_predictions,Motifs_predicted_by_Weeder] = motifDiscovery on DNA with Weeder {Analysis="medium",Organism="Homo sapiens",Support=100,Search both strands=true,Allow multiple copies=false,Number of motifs=Number_of_motifs,Motif width=8,Mismatches=1,Motif model="From exact pattern"} motif-prefix="Weeder"


# ---------------------------------------------------------------------------------
#  Evaluation
# ---------------------------------------------------------------------------------

# Compare the predictions of the different methods against the planted sites
Benchmark_analysis = analyze benchmark {Answer=target_sites,Aggregate=false,Site overlap=0.25}

# Track colors and bar chart layout settings used in the output
$color(MotifSampler_predictions)=RED
$color(PRIORITY_predictions)=YELLOW
$color(AlignACE_predictions)=ORANGE
$color(BioProspector_predictions)=GREEN
$color(MDscan_predictions)=LIGHT BLUE
$color(Weeder_predictions)=VIOLET
$setting(barchart.gradientfill)=ON
$setting(barchart.border)=BLACK
$setting(barchart.barwidth)=10
$setting(barchart.bardistance)=14
$setting(barchart.bargroupspacing)=28

# Write the benchmark results as an HTML report
Output1 = output Benchmark_analysis in HTML format {Metrics="Sn,Sp,PPV,ASP,PC,Acc,CC,sSN,sPPV,sASP",X-axis="Statistic/Groups",Use abbreviations=true,Graph scale=100,Color boxes=false}