"""
Interface between pybedtools and the R package VennDiagram.
Rather than depend on the user to have rpy2 installed, this simply writes an
R script that can be edited and tweaked by the user before being run in R.
"""
from __future__ import print_function
import os
import string
import pybedtools
import six
from pybedtools import helpers
import subprocess
from collections import OrderedDict
# really just fill in x and filename...leave the rest up to the user.
#
# Note that the closing parentheses is missing -- that's so the user can add
# kwargs from the calling function
template = string.Template("""
library(VennDiagram)
venn.diagram(
x=$x,
filename=$filename,
category.names = $names
""")
def _list_to_R_syntax(x):
"""
Convert items in `x` to a string, and replace tabs with pipes in Interval
string representations. Put everything into an R vector and return as one
big string.
"""
items = []
for i in x:
if isinstance(i, pybedtools.Interval):
i = str(i).replace('\t', '|')
items.append('"%s"' % i)
return 'c(%s)' % ','.join(items)
def _dict_to_R_named_list(d):
"""
Calls _list_to_R_syntax for each item. Returns one big string.
"""
items = []
for key, val in list(d.items()):
items.append('"%s" = %s' % (key, _list_to_R_syntax(val)))
return 'list(%s)' % ', '.join(items)
def truncator(feature):
"""
Convert a feature of any format into a BED3 format.
"""
return pybedtools.create_interval_from_list(
[feature.chrom, str(feature.start), str(feature.stop)])
[docs]def cleaned_intersect(items):
"""
Perform interval intersections such that the end products have identical \
features for overlapping intervals.
The VennDiagram package does *set* intersection, not *interval*
intersection. So the goal here is to represent intersecting intervals as
intersecting sets of strings.
Doing a simple BEDTools intersectBed call doesn't do the trick (even with
the -u argument). As a concrete example, what would the string be for an
intersection of the feature "chr1:1-100" in file `x` and "chr1:50-200" in
file `y`?
The method used here is to substitute the intervals in `y` that overlap `x`
with the corresponding elements in `x`. This means that in the resulting
sets, the overlapping features are identical. To follow up with the
example, both `x` and `y` would have an item "chr1:50-200" in their sets,
simply indicating *that* one interval overlapped.
Venn diagrams are not well suited for nested overlaps or multi-overlaps.
To illustrate, try drawing the 2-way Venn diagram of the following two
files. Specifically, what number goes in the middle -- the number of
features in `x` that intersect `y` (1) or the number of features in `y`
that intersect `x` (2)?::
x:
chr1 1 100
chr1 500 6000
y:
chr1 50 100
chr1 80 200
chr9 777 888
In this case, this function will return the following sets::
x:
chr1:1-100
chr1:500-6000
y:
chr1:1-100
chr9:777-888
This means that while `x` does not change in length, `y` can. For example,
if there are 2 features in `x` that overlap one feature in `y`, then `y`
will gain those two features in place of its single original feature.
This strategy is extended for multiple intersections -- see the source for
details.
"""
if len(items) == 2:
x = items[0].each(truncator).saveas()
y = items[1].each(truncator).saveas()
# Combine the unique-to-y intervals with the shared-with-x intervals.
# Since x is first in x+y, resulting features are from x.
new_y = (y - x).cat(x + y)
return x, new_y
if len(items) == 3:
x = items[0].each(truncator).saveas()
y = items[1].each(truncator).saveas()
z = items[2].each(truncator).saveas()
# Same as above. Don't care about z yet; this means that y will not
# change because of z.
new_y = (y - x).cat(x + y)
# Combine:
# unique-to-z
# shared-with-any-x
# shared-with-unique-to-y
new_z = (z - y - x).cat(x + z).cat((y - x) + z)
return x, new_y, new_z
if len(items) == 4:
x = items[0].each(truncator).saveas()
y = items[1].each(truncator).saveas()
z = items[2].each(truncator).saveas()
q = items[3].each(truncator).saveas()
# Same as 2-way
new_y = (y - x).cat(x + y)
# Same as 3-way
new_z = (z - y - x).cat(x + z).cat((y - x) + z)
# Combine:
# unique-to-q
# shared-with-any-x
# shared-with-unique-to-y
# shared-with-unique-to-z
new_q = (q - z - y - x)\
.cat(x + q)\
.cat((y - x) + q)\
.cat((z - y - x) + q)
return x, new_y, new_z, new_q
[docs]def venn_maker(beds, names=None, figure_filename=None, script_filename=None,
additional_args=None, run=False):
"""
Given a list of interval files, write an R script to create a Venn \
diagram of overlaps (and optionally run it).
The R script calls the venn.diagram function of the R package VennDiagram
for extremely flexible Venn and Euler diagram creation. Uses
`cleaned_intersect()` to create string representations of shared intervals.
`beds` is a list of up to 4 filenames or BedTools.
`names` is a list of names to use for the Venn diagram, in the same order
as `beds`. Default is "abcd"[:len(beds)].
`figure_filename` is the TIFF file to save the figure as.
`script_filename` is the optional filename to write the R script to
`additional_args` is list that will be inserted into the R script,
verbatim. For example, to use scaled Euler diagrams with different colors,
use::
additional_args = ['euler.d=TRUE',
'scaled=TRUE',
'cat.col=c("red","blue")']
If `run` is True, then assume R is installed, is on the path, and has
VennDiagram installed . . . and run the script. The resulting filename
will be saved as `figure_filename`.
"""
if figure_filename is None:
figure_filename = 'NULL'
else:
figure_filename = '"%s"' % figure_filename
if names is None:
names = "abcd"[:len(beds)]
_beds = []
for bed in beds:
if not isinstance(bed, pybedtools.BedTool):
bed = pybedtools.BedTool(bed)
_beds.append(bed)
cleaned = cleaned_intersect(_beds)
results = OrderedDict(list(zip(names, cleaned)))
s = template.substitute(
x=_dict_to_R_named_list(results),
filename=figure_filename,
names=_list_to_R_syntax(names))
if additional_args:
s += ',' + ', '.join(additional_args)
s += ")"
if not script_filename:
fn = pybedtools.BedTool._tmp()
else:
fn = script_filename
fout = open(fn, 'w')
fout.write(s)
fout.close()
out = fn + '.Rout'
if run:
if not pybedtools.settings._R_installed:
helpers._check_for_R()
cmds = [os.path.join(pybedtools.settings._R_path, 'R'), 'CMD', 'BATCH',
fn, out]
p = subprocess.Popen(
cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
if stdout or stderr:
print("stdout:", stdout)
print("stderr:", stderr)
if not script_filename:
return s
return None