Session 23 – Biopython: GenomeDiagram
23.1 Linear/Circular Gene Diagrams
First get a plasmid genbank file or even a mitochondrial dna genbank file. Feel free to use the example from this link.
Notice: macOSX migth need to install reportlab and use pip3 install reportlab.
# import modules
import os
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO
# set wd
os.chdir("YOUR WORKING DIRECTORY WITH GENBANK FILE HERE")
# load file as record
record = SeqIO.read("GENBANK FILE HERE", "genbank")
# set parameters, set empty diagram
gen_diagram = GenomeDiagram.Diagram("TITLE HERE")
gen_track_for_features = gen_diagram.new_track(1, name="Annotated Features")
gen_feature_set = gen_track_for_features.new_set()
# Generate diagram, alternate colors
for feature in record.features:
if feature.type != "gene":
# Do Nothing
continue
if len(gen_feature_set) % 2 == 0:
color = colors.blue
else:
color = colors.lightblue
gen_feature_set.add_feature(feature, color=color, label=True, label_size=10)
# Create output
gen_diagram.draw(
format="linear",
orientation="landscape",
pagesize="A4",
fragments=4,
start=0,
end=len(record),
)
# Save figure as ...
gen_diagram.write("example1.pdf", "PDF")
gen_diagram.write("example1.eps", "EPS")
gen_diagram.write("example1.svg", "SVG")
gen_diagram.write("example1.png", "PNG")
# Circular figure:
gen_diagram.draw(
format="circular",
circular=True,
pagesize=(20 * cm, 20 * cm),
start=0,
end=len(record),
circle_core=0.7,
)
gen_diagram.write("example1.pdf", "PDF")23.2 Diagram Readability
You can make several changes to the above code to improve readability of the diagram.
# edit the code above with the code below and see what changes
gen_feature_set.add_feature(
feature,
label=True,
color="purple",
label_position="end",
label_size=20,
label_angle=90,
)23.3 Box Shape
You can change the default shape of the gene boxes.
# Default
gen_feature_set.add_feature(feature)
# Box
gen_feature_set.add_feature(feature, sigil="BOX")
# Arrow
gen_feature_set.add_feature(feature, sigil="ARROW")
# Big Arrow
gen_feature_set.add_feature(feature, sigil="BIGARROW")
# Octo-gon
gen_feature_set.add_feature(feature, sigil="OCTO")
# Potter Zag
gen_feature_set.add_feature(feature, sigil="JAGGY")23.4 Edit Box Shape: Arrows
You can edit the shape of the new shape.
# Tail-less Arrow
gen_feature_set.add_feature(feature, sigil="ARROW", color="brown", arrowshaft_height=1.0)
# Thin Arrow
gen_feature_set.add_feature(feature, sigil="ARROW", color="teal", arrowshaft_height=0.2)
# Very Thin Arrow
gen_feature_set.add_feature(
feature, sigil="ARROW", color="darkgreen", arrowshaft_height=0.1
)
# Short Arrowhead
gen_feature_set.add_feature(feature, sigil="ARROW", color="blue", arrowhead_length=0.25)
# Long Arrowhead
gen_feature_set.add_feature(feature, sigil="ARROW", color="orange", arrowhead_length=1)
# All Arrowhead
gen_feature_set.add_feature(feature, sigil="ARROW", color="red", arrowhead_length=10000)
# Big Arrow
gen_feature_set.add_feature(feature, sigil="BIGARROW")23.5 Restriction Map
Add restriction sites to your small genome
# old
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO
# new
from Bio.SeqFeature import SeqFeature, FeatureLocation
# old
record = SeqIO.read("YOUR GB FILE HERE", "genbank")
# old
gd_diagram = GenomeDiagram.Diagram(record.id)
gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features")
gd_feature_set = gd_track_for_features.new_set()
# old
for feature in record.features:
if feature.type != "gene":
# Exclude this feature
continue
if len(gd_feature_set) % 2 == 0:
color = colors.blue
else:
color = colors.lightblue
gd_feature_set.add_feature(
feature, sigil="ARROW", color=color, label=True, label_size=14, label_angle=0
)
# new: Add restriction enzymes and sites
for site, name, color in [
("GAATTC", "EcoRI", colors.green),
("CCCGGG", "SmaI", colors.orange),
("AAGCTT", "HindIII", colors.red),
("GGATCC", "BamHI", colors.purple),
]:
index = 0
while True:
index = record.seq.find(site, start=index)
if index == -1:
break
feature = SeqFeature(FeatureLocation(index, index + len(site)))
gd_feature_set.add_feature(
feature,
color=color,
name=name,
label=True,
label_size=10,
label_color=color,
)
index += len(site)
# old, linear
gd_diagram.draw(format="linear", pagesize="A4", fragments=4, start=0, end=len(record))
gd_diagram.write("example2.pdf", "PDF")
gd_diagram.write("example2.eps", "EPS")
gd_diagram.write("example2.svg", "SVG")
# old, circular
gd_diagram.draw(
format="circular",
circular=True,
pagesize=(20 * cm, 20 * cm),
start=0,
end=len(record),
circle_core=0.5,
)
gd_diagram.write("example2.pdf", "PDF")
gd_diagram.write("example2.eps", "EPS")
gd_diagram.write("example2.svg", "SVG")23.6 Tracks on Tracks
I will use the following genbank file for my example of stacked tracks along with my previous file. You are again free to use your own genome/genome segment of choice.
from Bio import SeqIO
A_rec = SeqIO.read("XL.gb", "gb")
B_rec = SeqIO.read("XT.gb", "gb")
C_rec = SeqIO.read("NP.gb", "gb")
from reportlab.lib.colors import (
red,
grey,
orange,
green,
brown,
blue,
lightblue,
purple,
)
A_colors = (
[red] * 5
+ [grey] * 7
+ [orange] * 2
+ [grey] * 2
+ [orange]
+ [grey] * 11
+ [green] * 4
+ [grey]
+ [green] * 2
+ [grey, green]
+ [brown] * 5
+ [blue] * 4
+ [lightblue] * 5
+ [grey, lightblue]
+ [purple] * 2
+ [grey]
)
B_colors = (
[red] * 6
+ [grey] * 8
+ [orange] * 2
+ [grey]
+ [orange]
+ [grey] * 21
+ [green] * 5
+ [grey]
+ [brown] * 4
+ [blue] * 3
+ [lightblue] * 3
+ [grey] * 5
+ [purple] * 2
)
C_colors = (
[grey] * 30
+ [green] * 5
+ [brown] * 4
+ [blue] * 2
+ [grey, blue]
+ [lightblue] * 2
+ [grey] * 5
)
from Bio.Graphics import GenomeDiagram
name = "Stacked"
gd_diagram = GenomeDiagram.Diagram(name)
max_len = 0
for record, gene_colors in zip([A_rec, B_rec, C_rec], [A_colors, B_colors, C_colors]):
max_len = max(max_len, len(record))
gd_track_for_features = gd_diagram.new_track(
1, name=record.name, greytrack=True, start=0, end=len(record)
)
gd_feature_set = gd_track_for_features.new_set()
i = 0
for feature in record.features:
if feature.type != "gene":
# Exclude this feature
continue
gd_feature_set.add_feature(
feature,
sigil="ARROW",
color=gene_colors[i],
label=True,
name=str(i + 1),
label_position="start",
label_size=6,
label_angle=0,
)
i += 1
gd_diagram.draw(format="linear", pagesize="A4", fragments=1, start=0, end=max_len)
gd_diagram.write(name + ".pdf", "PDF")
gd_diagram.write(name + ".eps", "EPS")
gd_diagram.write(name + ".svg", "SVG")23.7 Cross-link Data
Time to find similarities and differences.
A_vs_B = [
(99, "ND1", "ND1"),
(33, "ND2", "ND2"),
(94, "COX1", "COX1"),
(100, "COX2", "COX2"),
(97, "ATP8", "ATP8"),
(98, "ATP6", "ATP6"),
(98, "COX3", "COX3"),
(100, "ND3", "ND3"),
(100, "ND4L", "ND4L"),
(94, "ND4", "ND4"),
(87, "ND5", "ND5"),
(94, "ND6", "ND6"),
(94, "CYTB", "CYTB"),
]
B_vs_C = [
(49, "ND1", "ND1"),
(53, "ND2", "ND2"),
(64, "COX1", "COX1"),
(30, "COX2", "COX2"),
(27, "ATP8", "ATP8"),
(88, "ATP6", "ATP6"),
(68, "COX3", "COX3"),
(30, "ND3", "ND3"),
(100, "ND4L", "ND4L"),
(74, "ND4", "ND4"),
(37, "ND5", "ND5"),
(74, "ND6", "ND6"),
(64, "CYTB", "CYTB"),
]
def get_feature(features, id, tags=["locus_tag", "gene"]):
"""Search list of SeqFeature objects for an identifier under the given tags."""
for f in features:
for key in tags:
# tag may not be present in this feature
for x in f.qualifiers.get(key, []):
if x == id:
return f
raise KeyError(id)
from Bio.Graphics.GenomeDiagram import CrossLink
from reportlab.lib import colors
for rec_X, tn_X, rec_Y, tn_Y, X_vs_Y in [
(A_rec, 3, B_rec, 2, A_vs_B),
(B_rec, 2, C_rec, 1, B_vs_C),
]:
track_X = gd_diagram.tracks[tn_X]
track_Y = gd_diagram.tracks[tn_Y]
for score, id_X, id_Y in X_vs_Y:
feature_X = get_feature(rec_X.features, id_X)
feature_Y = get_feature(rec_Y.features, id_Y)
color = colors.linearlyInterpolatedColor(
colors.white, colors.firebrick, 0, 100, score
)
link_xy = CrossLink(
(track_X, feature_X.location.start, feature_X.location.end),
(track_Y, feature_Y.location.start, feature_Y.location.end),
color,
colors.lightgrey,
)
gd_diagram.cross_track_links.append(link_xy)
gd_diagram.draw(format="linear", pagesize="A4", fragments=1, start=0, end=max_len)
gd_diagram.write(name + ".pdf", "PDF")
gd_diagram.write(name + ".eps", "EPS")
gd_diagram.write(name + ".svg", "SVG")23.8 Clean Cross-Link
from reportlab.lib import colors
from reportlab.lib.colors import red, grey, orange, green, brown
from reportlab.lib.colors import blue, lightblue, purple
from Bio.Graphics import GenomeDiagram
from Bio.Graphics.GenomeDiagram import CrossLink
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
name = "Example3"
A_rec = SeqIO.read("XL.gb", "gb")
B_rec = SeqIO.read("XT.gb", "gb")
C_rec = SeqIO.read("NP.gb", "gb")
records = {rec.name: rec for rec in [A_rec, B_rec, C_rec]}
A_colors = (
[red]
+ [grey]
+ [orange]
+ [grey]
+ [orange]
+ [grey]
+ [green]
+ [grey]
+ [green]
+ [grey, green]
+ [brown]
+ [blue]
+ [lightblue]
+ [grey, lightblue]
+ [purple]
+ [grey]
)
B_colors = (
[red]
+ [grey]
+ [orange]
+ [grey]
+ [orange]
+ [grey]
+ [green]
+ [grey]
+ [brown]
+ [blue]
+ [lightblue]
+ [grey]
+ [purple]
)
C_colors = (
[grey]
+ [green]
+ [brown]
+ [blue]
+ [grey, blue]
+ [lightblue]
+ [grey] * 8
)
A_vs_B = [
(99, "ND1", "ND1"),
(33, "ND2", "ND2"),
(94, "COX1", "COX1"),
(100, "COX2", "COX2"),
(97, "ATP8", "ATP8"),
(98, "ATP6", "ATP6"),
(98, "COX3", "COX3"),
(100, "ND3", "ND3"),
(100, "ND4L", "ND4L"),
(94, "ND4", "ND4"),
(87, "ND5", "ND5"),
(94, "ND6", "ND6"),
(94, "CYTB", "CYTB"),
]
B_vs_C = [
(49, "ND1", "ND1"),
(53, "ND2", "ND2"),
(64, "COX1", "COX1"),
(30, "COX2", "COX2"),
(27, "ATP8", "ATP8"),
(88, "ATP6", "ATP6"),
(68, "COX3", "COX3"),
(30, "ND3", "ND3"),
(100, "ND4L", "ND4L"),
(74, "ND4", "ND4"),
(37, "ND5", "ND5"),
(74, "ND6", "ND6"),
(64, "CYTB", "CYTB"),
]
def get_feature(features, id, tags=("locus_tag", "gene", "old_locus_tag")):
"""Search list of SeqFeature objects for an identifier under the given tags."""
for f in features:
for key in tags:
# tag may not be present in this feature
for x in f.qualifiers.get(key, []):
if x == id:
return f
raise KeyError(id)
gd_diagram = GenomeDiagram.Diagram(name)
feature_sets = {}
max_len = 0
for i, record in enumerate([A_rec, B_rec, C_rec]):
max_len = max(max_len, len(record))
# Allocate tracks 5 (top), 3, 1 (bottom) for A, B, C
# (empty tracks 2 and 4 add useful white space to emphasise the cross links
# and also serve to make the tracks vertically more compressed)
gd_track_for_features = gd_diagram.new_track(
5 - 2 * i,
name=record.name,
greytrack=True,
height=0.5,
start=0,
end=len(record),
)
assert record.name not in feature_sets
feature_sets[record.name] = gd_track_for_features.new_set()
for X, Y, X_vs_Y in [
("NC_001573", "NC_006839", A_vs_B),
("NC_006839", "NC_026789", B_vs_C),
]:
features_X = records[X].features
features_Y = records[Y].features
set_X = feature_sets[X]
set_Y = feature_sets[Y]
for score, x, y in X_vs_Y:
color = colors.linearlyInterpolatedColor(
colors.white, colors.firebrick, 0, 100, score
)
border = colors.lightgrey
f_x = get_feature(features_X, x)
F_x = set_X.add_feature(
SeqFeature(FeatureLocation(f_x.location.start, f_x.location.end, strand=0)),
color=color,
border=border,
)
f_y = get_feature(features_Y, y)
F_y = set_Y.add_feature(
SeqFeature(FeatureLocation(f_y.location.start, f_y.location.end, strand=0)),
color=color,
border=border,
)
gd_diagram.cross_track_links.append(CrossLink(F_x, F_y, color, border))
for record, gene_colors in zip([A_rec, B_rec, C_rec], [A_colors, B_colors, C_colors]):
gd_feature_set = feature_sets[record.name]
i = 0
for feature in record.features:
if feature.type != "gene":
# Exclude this feature
continue
try:
g_color = gene_colors[i]
except IndexError:
print("Don't have color for %s gene %i" % (record.name, i))
g_color = grey
gd_feature_set.add_feature(
feature,
sigil="BIGARROW",
color=g_color,
label=True,
name=str(i + 1),
label_position="start",
label_size=6,
label_angle=0,
)
i += 1
gd_diagram.draw(format="linear", pagesize="A4", fragments=1, start=0, end=max_len)
gd_diagram.write(name + ".pdf", "PDF")
gd_diagram.write(name + ".eps", "EPS")
gd_diagram.write(name + ".svg", "SVG")