#################################################################################
#                                                                               #
#  This protocol will perform motif scanning and output a table showing         #
#  the number of times each motif is found in the sequences.                    #
#  An overrepresentation p-value is calculated for each motif by comparing      #
#  the motif's observed frequency in the sequences to its expected frequency    #
#  based on the number of times it is found in a set of artificial DNA          #
#  sequences having the same oligonucleotide composition as the original        #
#  sequences.                                                                   #
#                                                                               #
#################################################################################

# Uncomment the next line (remove the #-sign before 'AllSequences') to use a precompiled collection of example sequences
#AllSequences = new Sequence Collection(File:"http://tare.medisin.ntnu.no/motiflab/exampledata/MuscleGenes_-2000+200.txt",format=Location)

# Load the DNA sequence track and the motif collection to scan with.
DNA = new DNA Sequence Dataset(DataTrack:DNA)
Motifs = new Motif Collection(Collection:Jaspar Core)
# Alternative motif collection: uncomment the next line to assign the TRANSFAC Public
# collection to 'Motifs' instead (it would then replace the Jaspar Core assignment above).
#Motifs = new Motif Collection(Collection:TRANSFAC Public)

# Perform motif scanning in the DNA sequence with a cutoff threshold selected by the user (default 95%).
# The same 'cutoff' variable is reused further down when scanning the artificial sequences,
# so that observed and expected counts are obtained with identical scanner settings.
cutoff = new Numeric Variable(95.0)
prompt for cutoff "Select motif scanning cutoff threshold [0-100]"
TFBS = motifScanning on DNA with SimpleScanner {Motif Collection=Motifs,Score="Absolute",Threshold type="Percentage",Threshold=cutoff}

# Create a background model based on the oligonucleotide frequency composition in the DNA track
# and then use this model to create a new artificial DNA track with the same frequency composition.
# By performing the same motif scanning step in this random DNA track, we can calculate expected
# occurrence frequencies for each motif. Note that the accuracy of this expected frequency estimation
# will depend on the total length of the DNA sequences used. More/longer sequences will result in more
# accurate estimations. If the total length of the DNA sequences is too short, many motifs will not
# appear at all in the random DNA and these will then be assigned default expected frequencies of 0.
# The Markov model order is selected by the user (default 3).
# NOTE(review): 'Strand=Relative' presumably makes the model follow each sequence's own
# orientation rather than the direct strand - confirm against the MotifLab documentation.
BG_model_order = new Numeric Variable(3)
prompt for BG_model_order "Select Markov Model order for background model creation [0-5]"
BG_model = new Background Model(Track:DNA,Order=BG_model_order,Strand=Relative)

# The next line will create a randomized/scrambled version of the original DNA track by using
# the BG_model to randomly select new bases to replace the original base at each position.
# The result is stored under a new name (DNA_scrambled), and the original DNA track is still
# used by the analysis below via the TFBS track.
DNA_scrambled = mask DNA relative strand with BG_model
# DNA_scrambled is only an intermediate result; the $hide setting presumably keeps it
# out of the visualization - TODO confirm.
$hide(DNA_scrambled)

# Perform motif scanning in the artificial sequences (same scanner, motifs and cutoff as for the real DNA)
TFBS_scrambled = motifScanning on DNA_scrambled with SimpleScanner {Motif Collection=Motifs,Score="Absolute",Threshold type="Percentage",Threshold=cutoff}
# Intermediate result as well; hidden in the same way as DNA_scrambled.
$hide(TFBS_scrambled)

# Now estimate expected occurrence frequencies for each motif based on the number of times they occur in the artificial sequences
Expected_Motif_Frequencies = new Motif Numeric Map(Track:TFBS_scrambled,property=Frequency)

# Count the number of times each motif occurs in the sequences and compare the observed motif frequencies
# to the expected frequencies in random DNA to calculate p-values.
# A significance threshold of 0.05 is used together with the "All motifs" Bonferroni correction setting
# (presumably correcting for the total number of motifs tested - verify in the analysis documentation).
Analysis_Motif_Occurrences = analyze count motif occurrences {Motif track=TFBS,Motifs=Motifs,Background frequencies=Expected_Motif_Frequencies,Significance threshold=0.05,Bonferroni correction="All motifs"}

# Motif sequence logos are not included in the results table, since creating logo images for every motif can be quite resource consuming.
# However, if you want to include sequence logos in the output, change the
# value of the 'Logos' parameter at the end of the next line to
# either "Shared images" or "New images" (instead of "No").
# The table is output in HTML format, sorted by p-value, and includes all motifs.
Motif_Count_Table = output Analysis_Motif_Occurrences in HTML format {Sort by="p-value",Include="All motifs",Logos="No"}