###################################################################################
#                                                                                 #
#  Filter Predicted Binding Sites                                                 #
#  ------------------------------------                                           #
#                                                                                 #
#  This protocol performs motif scanning in a set of sequences with motifs from   #
#  TRANSFAC and then proceeds to filter out predictions according to different    #
#  criteria, such as binding sites that are not (very) conserved, binding sites   #
#  overlapping with known repeat regions, binding sites that are not located      #
#  within a DNase hypersensitivity site, binding sites that are not supported by  #
#  a ChIP-seq peak region for the corresponding TF or binding sites that do not   #
#  have sites for known interaction partners within a specified distance.         #
#                                                                                 #
###################################################################################

# Note on the "filter" operations used throughout this protocol: each filter REMOVES the
# regions that satisfy its where-condition. The name of each resulting track therefore
# describes the binding sites that remain after the filter has been applied.

# Uncomment the next line (remove #-sign before 'AllSequences') to use a precompiled collection of example sequences
# AllSequences = new Sequence Collection(File:"http://tare.medisin.ntnu.no/motiflab/exampledata/MuscleGenes_-2000+200.txt",format=Location)

DNA = new DNA Sequence Dataset(DataTrack:DNA)
TRANSFAC_Public = new Motif Collection(Collection:TRANSFAC Public)
BindingSites = motifScanning on DNA with SimpleScanner {Motif Collection=TRANSFAC_Public,Threshold type="Percentage",Threshold=95,Score="Absolute"}

Conservation = new Numeric Dataset(DataTrack:Conservation)
RepeatMasker = new Region Dataset(DataTrack:RepeatMasker)
DNaseHS_peaks = new Region Dataset(DataTrack:DNaseHS_peaks)
$expand(DNaseHS_peaks)

# Filter out non-conserved sites (or sites with low average conservation).
# The "0.2" threshold used here is arbitrary.
TFBS_conserved = filter BindingSites where region's average Conservation < 0.2

# Filter binding sites that do not overlap with a DNaseHS peak region in any cell types (at least not the ones in this track)
TFBS_within_DNaseHS_peak = filter BindingSites where not region overlaps DNaseHS_peaks

# If you want to consider DNaseHS sites for just a single cell type, you can create a new DNaseHS track
# that only contains regions for your selected cell type using the operations below (shown here for K562).
#
#   DNaseHS_peaks_K562 = filter DNaseHS_peaks where not region's type equals "K562"
#   TFBS_within_DNaseHS_peak_K562 = filter BindingSites where not region overlaps DNaseHS_peaks_K562

# To see which cell types have DNaseHS regions in the sequences, you can use the following analysis:
#
#   DNAseHS_celltypes = analyze count region occurrences {Region track=DNaseHS_peaks}

# Filter TFBS that reside within known repeat regions
TFBS_outside_repeats = filter BindingSites where region overlaps RepeatMasker

# The following track contains ChIP-seq peak regions for several TFs and is based on multiple cell types
TFBS_ChIP_Seq = new Region Dataset(DataTrack:TFBS_ChIP-Seq)

# The regions in the TFBS_ChIP_Seq track have type names based on the binding TF (e.g. "STAT1", "p300" etc.),
# so we need some way to map these TF names to the corresponding motif IDs which are used in the BindingSites track.
# This is not straightforward since there can be a many-to-many relationship between TFs and (Transfac) motifs.
# The following two "map" objects (represented here as general Text Variable objects since we have no actual Map type for this purpose)
# specify an association between TFs and (possibly multiple) motifs as "key=>value" pairs.
# The "lenient" map contains a few more motif models for each TF than the strict map, and can also contain motifs for
# related TFs with similar binding motifs
ChIPSeqMap_Strict = new Text Variable("CEBPB=>M00109 M00117","c-Fos=>M00517 M00924 M00926 M00925 M00172","c-Jun=>M00517 M00924 M00926 M00925 M00172 M00041","c-Myc=>M01145 M01154 M01034 M00799 M00118 M00123 M00615 M00322","CTCF=>M01196 M01200 M01259","EBF=>M00977","Egr-1=>M00243 M00807 M00982","ERRA=>M00511","FOSL2=>M00924 M00926 M00925","GABP=>M00108 M01258 M01660 M00341 M00971","GR=>M00205 M00192 M00921 M00955","HNF4A=>M00638 M00967 M01033 M01032 M01031 M00764 M00134 M00411 M00762 M00158","HSF1=>M01023 M00146 M00641","IRF4=>M00772 M00972","JunD=>M00517 M00924 M00925 M00926","Max=>M00118 M00119 M00123 M00322 M00615 M00799 M01034","NF-E2=>M00037 M00983","NFKB=>M00051 M00052 M00053 M00054 M00194 M00208 M00774 M01223 M01224 M01239","Nrf1=>M00284 M00285 M00652 M00983","NRSF=>M00256 M00325 M01028 M01256","p300=>M00033","PAX5-C20=>M00143 M00144 M00808","PAX5-N19=>M00143 M00144 M00808","Pbx3=>M00998","POU2F2=>M00210 M00795 M01368","PU.1=>M00658 M00971 M01172 M01203 M01204","RXRA=>M00242 M00444 M00512 M00515 M00518 M00631 M00647 M00762 M00763 M00766 M00767 M00963 M00964 M00965 M00966 M01152 M01153 M01198 M01202 M01268","SP1=>M00008 M00196 M00255 M00931 M00932 M00933 M01219 M01303","SREBP1=>M00220 M00221 M00749 M00776 M01168 M01173","SREBP2=>M00776 M01168 M01177","SRF=>M00152 M00186 M00215 M00810 M00922 M01007 M01257 M01304","STAT1=>M00223 M00224 M00492 M00496 M00777 M01212 M01260","STAT2=>M00223 M00777","TAF1=>M00216 M00252 M00311 M00320 M00471 M00980","TCF12=>M00698","USF-1=>M00121 M00122 M00187 M00217 M00726 M00796 M01034","ZBTB33=>M01119")
ChIPSeqMap_Lenient = new Text Variable("BATF=>M00017 M00040 M00041 M00179 M00338 M00483 M00513 M00514 M00691 M00801 M00981 M01295","BHLHE40=>M00066 M00222 M00263 M00434 M00435 M00440 M00948 M00952 M00973 M01050 M01054 M01116 M01249 M01523 M01585 M01621 M00804 M00929 M00303 M01577 M01699 M01727 M00002 M00065 M00070 M00071 M00693 M00698 M00001 M00184 M00712 M01302 M00993 M01591 M01103 M00058 M00068 M01287 M01288 M01716 M00067 M00985 M00997 M01009 M00064 M01564 M00139 M00235 M00236 M00237 M00466 M00539 M00778 M00797 M00976 M01671 M01689 M01229 M01240 M01241 M00261 M00977 M00446 M01558 M01580 M01029 M00121 M00122 M00187 M00217 M00726 M00796 M00220 M00221 M00749 M00776 M01168 M01173 M01177 M00005 M00175 M00176 M00927 M00118 M00123 M00322 M00615 M00799 M00055 M01145 M01154 M00119","CEBPB=>M00159 M00201 M00912 M00190 M00770 M00116 M00109 M00117 M00621 M00622 M00249","c-Fos=>M00517 M00199 M00924 M00173 M00926 M00188 M00925 M00174 M00172 M00041","c-Jun=>M00517 M00199 M00924 M00173 M00926 M00188 M00925 M00174 M00172 M00041","c-Myc=>M01145 M01154 M01034 M00799 M00118 M00123 M00615 M00322","CTCF=>M01196 M01200 M01259","EBF=>M00467 M00977","Egr-1=>M00243 M00245 M00246 M00807 M00982","ERRA=>M00959 M00191 M00511 M01589","FOSL2=>M00517 M00199 M00924 M00173 M00926 M00188 M00925 M00174 M00172","FOXP2=>M00268 M00267 M00332 M00724 M01261 M00791 M01012 M00294 M00742 M00289 M00129 M00293 M00292 M00291 M00290 M00992 M01599 M00987 M00476 M00472 M01137 M00477 M01216 M00474 M00473 M00630 M00423 M00422 M00130 M00809","GABP=>M00108 M01258 M01660 M00341 M00971","GR=>M00205 M00192 M00921 M00955","HEY1=>M00261 M00977 M01229 M01240 M01241 M01671 M01689 M00139 M00235 M00236 M00237 M00466 M00539 M00778 M00797 M00976 M00064 M01564 M00067 M00985 M00997 M01009 M01287 M01288 M01716 M00058 M00068 M01103 M00993 M01591 M00001 M00184 M00712 M01302 M00002 M00065 M00070 M00071 M00693 M00698 M00303 M01577 M01699 M01727 M00804 M00929 M00066 M00222 M00263 M00434 M00435 M00440 M00948 M00952 M00973 M01050 M01054 M01116 M01249","HNF4A=>M00638 M00967 M01033 M01032 M01031 M00764 M00134 M00411 M00762 M00158","HSF1=>M01244 M00147 M01023 M00146 M00641","IRF4=>M00062 M00063 M00453 M00699 M00747 M00772 M00972 M01279 M01452 M01665","JunD=>M00036 M00041 M00172 M00173 M00174 M00188 M00199 M00517 M00924 M00925 M00926","Max=>M00118 M00119 M00123 M00322 M00615 M00799 M01034","NF-E2=>M00037 M00983","NFKB=>M00051 M00052 M00053 M00054 M00194 M00208 M00774 M01223 M01224 M01239","Nrf1=>M00284 M00285 M00652 M00983","NRSF=>M00256 M00325 M01028 M01256","p300=>M00033","PAX5-C20=>M00097 M00098 M00143 M00144 M00326 M00327 M00328 M00329 M00360 M00373 M00377 M00378 M00380 M00486 M00717 M00808 M00979 M01339 M01385 M01391","PAX5-N19=>M00097 M00098 M00143 M00144 M00326 M00327 M00328 M00329 M00360 M00373 M00377 M00378 M00380 M00486 M00717 M00808 M00979 M01339 M01385 M01391","Pbx3=>M00096 M00124 M00998 M01017 M01357","PGC1A=>M00242 M00512 M00515 M00518 M00528 M00762 M00763 M01270 M01282","POU2F2=>M00133 M00135 M00136 M00137 M00138 M00145 M00161 M00162 M00195 M00210 M00248 M00342 M00463 M00464 M00465 M00662 M00689 M00744 M00795 M00802 M00930 M01098 M01124 M01125 M01307 M01316 M01324 M01354 M01368 M01408 M01462 M01465 M01473 M01476 M01477 M01479","PU.1=>M00007 M00016 M00025 M00032 M00074 M00108 M00339 M00340 M00341 M00531 M00655 M00658 M00678 M00743 M00746 M00771 M00971 M01078 M01163 M01165 M01167 M01172 M01197 M01203 M01204 M01207 M01208 M01214 M01258 M01266","RXRA=>M00242 M00444 M00512 M00515 M00518 M00631 M00647 M00762 M00763 M00766 M00767 M00963 M00964 M00965 M00966 M01152 M01153 M01198 M01202 M01268","SIX5=>M01313 M01345 M01358 M01374 M01398 M01433","SP1=>M00008 M00196 M00255 M00931 M00932 M00933 M01219 M01303","SREBP1=>M00220 M00221 M00749 M00776 M01168 M01173 M01177","SREBP2=>M00220 M00221 M00749 M00776 M01168 M01173 M01177","SRF=>M00026 M00152 M00186 M00215 M00403 M00405 M00406 M00407 M00810 M00922 M00941 M01007 M01257 M01304","STAT1=>M00223 M00224 M00225 M00259 M00457 M00459 M00460 M00492 M00493 M00494 M00496 M00497 M00498 M00499 M00500 M00777 M01212 M01220 M01260 M01595 M01666","STAT2=>M00223 M00224 M00225 M00259 M00457 M00459 M00460 M00492 M00493 M00494 M00496 M00497 M00498 M00499 M00500 M00777 M01212 M01220 M01260 M01595 M01666","TAF1=>M00216 M00252 M00311 M00320 M00471 M00980","TCF12=>M00002 M00065 M00066 M00071 M00222 M00693 M00804 M00929 M00973 M01034 M01594 M01716 M00698","USF-1=>M00121 M00122 M00187 M00217 M00726 M00796 M01034","ZBTB33=>M01119")

# The following transforms will check the type of each region in the TFBS_ChIP_Seq track,
# and if the type matches any of the "keys" specified in the maps (Text Variables),
# the region type will be replaced with the corresponding "value" from the map.
TFBS_ChIP_Seq_Lenient = transform TFBS_ChIP_Seq with type-replace(ChIPSeqMap_Lenient)
TFBS_ChIP_Seq_Strict = transform TFBS_ChIP_Seq with type-replace(ChIPSeqMap_Strict)

# Now filter out predicted binding sites that do not overlap with a ChIP-seq region which has the motif ID
# of the TFBS listed as part of their region type.
TFBS_supported_by_ChIPseq_lenient = filter BindingSites where not region overlaps type-matching TFBS_ChIP_Seq_Lenient
TFBS_supported_by_ChIPseq_strict = filter BindingSites where not region overlaps type-matching TFBS_ChIP_Seq_Strict

# Filter binding sites that do not have other sites for potential known interaction partners nearby
# The settings used here specify that the other site must be located between 0 and 16 bp away.
# This means that overlapping sites are not considered. This will avoid "problems" with overlapping sites
# for the same TF represented by different motif models, since many TFs interact with themselves
# (or at least other TFs of the same type).
TFBS_interacting = filter BindingSites where not region's distance to any interaction partner in 0 to 16