# Load sequences to analyse # The specified file contains promoter regions [-2000,+200] for genes that were differentially expressed # after 2 hours in response to treatment with forskolin AllSequences = new Sequence Collection(File:"http://tare.medisin.ntnu.no/motiflab/forskolin/sequences_peak2h.txt", format=Location {Format="10 field format"}) # Obtain the DNA sequences and also a Conservation track DNA = new DNA Sequence Dataset(DataTrack:DNA) Conservation = new Numeric Dataset(DataTrack:Conservation) # Load the gene expression values for all genes and divide the sequences into two groups (upregulated/downregulated) based on these values GeneExpression = new Sequence Numeric Map(File:"http://tare.medisin.ntnu.no/motiflab/forskolin/gene_expression_2h.txt", format=MapFormat {Entry separator="Newline",Key-value separator="Tab",Include entries="All",Include default=false}) Upregulated = new Sequence Collection(Map:GeneExpression>0) Downregulated = new Sequence Collection(Map:GeneExpression<0) # The full analysis presented in the paper used 931 vertebrate motifs from TRANSFAC PRO # but here we use TRANSFAC public motifs TRANSFAC_motifs = new Motif Collection(Collection:TRANSFAC Public) # Scan for motif instances in the sequences with a 95% match cutoff BindingSites = motifScanning on DNA with SimpleScanner {Motif Collection=TRANSFAC_motifs,Score="Absolute",Threshold type="Percentage",Threshold=95} # The expected motif frequencies have been precalculated in other (larger) sequence sets for better precision Expected_motif_frequency = new Motif Numeric Map(File:"http://tare.medisin.ntnu.no/motiflab/datasets/MotifFrequencies_EPDhuman3_simplescanner95.txt", format=MapFormat {Entry separator="Newline",Key-value separator="Equals",Include entries="All",Include default=false}) # Perform analyses to find the number of times each motif occurs in the sequences, the p-value for overrepresentation # (compared to the expected frequency from the random control sequences), the average conservation level for each motif # across all its sites, and the positional distribution of the motifs (measured using kurtosis). Also see if some motifs # are differentially represented in the upregulated genes versus the downregulated Analysis1_count = analyze count motif occurrences {Motif track=BindingSites,Motifs=TRANSFAC_motifs,Background frequencies=Expected_motif_frequency,Significance threshold=0.05,Bonferroni correction="All motifs"} Analysis2_Conservation = analyze compare motif track to numeric track {Motif track=BindingSites,Motifs=TRANSFAC_motifs,Numeric track=Conservation} Analysis3_position = analyze motif position distribution {Motif track=BindingSites,Motifs=TRANSFAC_motifs,Alignment anchor="TSS",Include histograms=true,Support=true} Analysis4_Up_vs_Down = analyze compare motif occurrences {Motif track=BindingSites,Motifs=TRANSFAC_motifs,Target set=Upregulated,Control set=Downregulated,Statistical test="Binomial",Significance threshold=0.05,Bonferroni correction="All motifs"} rank_pvalue = rank ascending "p-value" from Analysis1_count rank_conservation = rank descending "average" from Analysis2_Conservation rank_kurtosis = rank descending "Kurtosis" from Analysis3_position rank_combined = rank ascending "p-value" from Analysis1_count, descending "average" from Analysis2_Conservation, descending "Kurtosis" from Analysis3_position Result = collate rank_combined as "rank", "total" from Analysis1_count, "support" from Analysis1_count, "p-value" from Analysis1_count, rank_pvalue, "average" from Analysis2_Conservation as "Conservation", rank_conservation, "Kurtosis" from Analysis3_position, rank_kurtosis, "Histogram" from Analysis3_position, "group" from Analysis4_Up_vs_Down, title="Significant motifs in sequences differentially expressed in response to treatment with forskolin" $display(Result)