Session 23 – Biopython: GenomeDiagram
23.1 Linear/Circular Gene Diagrams
First get a plasmid genbank file or even a mitochondrial dna genbank file. Feel free to use the example from this link.
Notice: macOSX migth need to install reportlab and use pip3 install reportlab
.
# import modules
import os
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO
# set wd
os.chdir("YOUR WORKING DIRECTORY WITH GENBANK FILE HERE")
# load file as record
= SeqIO.read("GENBANK FILE HERE", "genbank")
record
# set parameters, set empty diagram
= GenomeDiagram.Diagram("TITLE HERE")
gen_diagram = gen_diagram.new_track(1, name="Annotated Features")
gen_track_for_features = gen_track_for_features.new_set()
gen_feature_set
# Generate diagram, alternate colors
for feature in record.features:
if feature.type != "gene":
# Do Nothing
continueif len(gen_feature_set) % 2 == 0:
= colors.blue
color else:
= colors.lightblue
color gen_feature_set.add_feature(feature, color=color, label=True, label_size=10)
# Create output
gen_diagram.draw(
format="linear",
orientation="landscape",
pagesize="A4",
fragments=4,
start=0,
end=len(record),
)
# Save figure as ...
gen_diagram.write("example1.pdf", "PDF")
gen_diagram.write("example1.eps", "EPS")
gen_diagram.write("example1.svg", "SVG")
gen_diagram.write("example1.png", "PNG")
# Circular figure:
gen_diagram.draw(
format="circular",
circular=True,
pagesize=(20 * cm, 20 * cm),
start=0,
end=len(record),
circle_core=0.7,
)gen_diagram.write("example1.pdf", "PDF")
23.2 Diagram Readability
You can make several changes to the above code to improve readability of the diagram.
# edit the code above with the code below and see what changes
gen_feature_set.add_feature(
feature,label=True,
color="purple",
label_position="end",
label_size=20,
label_angle=90,
)
23.3 Box Shape
You can change the default shape of the gene boxes.
# Default
gen_feature_set.add_feature(feature)
# Box
gen_feature_set.add_feature(feature, sigil="BOX")
# Arrow
gen_feature_set.add_feature(feature, sigil="ARROW")
# Big Arrow
gen_feature_set.add_feature(feature, sigil="BIGARROW")
# Octo-gon
gen_feature_set.add_feature(feature, sigil="OCTO")
# Potter Zag
gen_feature_set.add_feature(feature, sigil="JAGGY")
23.4 Edit Box Shape: Arrows
You can edit the shape of the new shape.
# Tail-less Arrow
gen_feature_set.add_feature(feature, sigil="ARROW", color="brown", arrowshaft_height=1.0)
# Thin Arrow
gen_feature_set.add_feature(feature, sigil="ARROW", color="teal", arrowshaft_height=0.2)
# Very Thin Arrow
gen_feature_set.add_feature(
sigil="ARROW", color="darkgreen", arrowshaft_height=0.1
feature,
)
# Short Arrowhead
gen_feature_set.add_feature(feature, sigil="ARROW", color="blue", arrowhead_length=0.25)
# Long Arrowhead
gen_feature_set.add_feature(feature, sigil="ARROW", color="orange", arrowhead_length=1)
# All Arrowhead
gen_feature_set.add_feature(feature, sigil="ARROW", color="red", arrowhead_length=10000)
# Big Arrow
gen_feature_set.add_feature(feature, sigil="BIGARROW")
23.5 Restriction Map
Add restriction sites to your small genome
# old
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Graphics import GenomeDiagram
from Bio import SeqIO
# new
from Bio.SeqFeature import SeqFeature, FeatureLocation
# old
= SeqIO.read("YOUR GB FILE HERE", "genbank")
record
# old
= GenomeDiagram.Diagram(record.id)
gd_diagram = gd_diagram.new_track(1, name="Annotated Features")
gd_track_for_features = gd_track_for_features.new_set()
gd_feature_set
# old
for feature in record.features:
if feature.type != "gene":
# Exclude this feature
continueif len(gd_feature_set) % 2 == 0:
= colors.blue
color else:
= colors.lightblue
color gd_feature_set.add_feature(
sigil="ARROW", color=color, label=True, label_size=14, label_angle=0
feature,
)
# new: Add restriction enzymes and sites
for site, name, color in [
"GAATTC", "EcoRI", colors.green),
("CCCGGG", "SmaI", colors.orange),
("AAGCTT", "HindIII", colors.red),
("GGATCC", "BamHI", colors.purple),
(:
]= 0
index while True:
= record.seq.find(site, start=index)
index if index == -1:
break
= SeqFeature(FeatureLocation(index, index + len(site)))
feature gd_feature_set.add_feature(
feature,color=color,
name=name,
label=True,
label_size=10,
label_color=color,
)+= len(site)
index
# old, linear
gd_diagram.draw(format="linear", pagesize="A4", fragments=4, start=0, end=len(record))
gd_diagram.write("example2.pdf", "PDF")
gd_diagram.write("example2.eps", "EPS")
gd_diagram.write("example2.svg", "SVG")
# old, circular
gd_diagram.draw(
format="circular",
circular=True,
pagesize=(20 * cm, 20 * cm),
start=0,
end=len(record),
circle_core=0.5,
)gd_diagram.write("example2.pdf", "PDF")
gd_diagram.write("example2.eps", "EPS")
gd_diagram.write("example2.svg", "SVG")
23.6 Tracks on Tracks
I will use the following genbank file for my example of stacked tracks along with my previous file. You are again free to use your own genome/genome segment of choice.
from Bio import SeqIO
= SeqIO.read("XL.gb", "gb")
A_rec = SeqIO.read("XT.gb", "gb")
B_rec = SeqIO.read("NP.gb", "gb")
C_rec
import (
from reportlab.lib.colors
red,
grey,
orange,
green,
brown,
blue,
lightblue,
purple,
)
= (
A_colors * 5
[red] + [grey] * 7
+ [orange] * 2
+ [grey] * 2
+ [orange]
+ [grey] * 11
+ [green] * 4
+ [grey]
+ [green] * 2
+ [grey, green]
+ [brown] * 5
+ [blue] * 4
+ [lightblue] * 5
+ [grey, lightblue]
+ [purple] * 2
+ [grey]
)= (
B_colors * 6
[red] + [grey] * 8
+ [orange] * 2
+ [grey]
+ [orange]
+ [grey] * 21
+ [green] * 5
+ [grey]
+ [brown] * 4
+ [blue] * 3
+ [lightblue] * 3
+ [grey] * 5
+ [purple] * 2
)= (
C_colors * 30
[grey] + [green] * 5
+ [brown] * 4
+ [blue] * 2
+ [grey, blue]
+ [lightblue] * 2
+ [grey] * 5
)
from Bio.Graphics import GenomeDiagram
= "Stacked"
name = GenomeDiagram.Diagram(name)
gd_diagram = 0
max_len for record, gene_colors in zip([A_rec, B_rec, C_rec], [A_colors, B_colors, C_colors]):
= max(max_len, len(record))
max_len = gd_diagram.new_track(
gd_track_for_features 1, name=record.name, greytrack=True, start=0, end=len(record)
)= gd_track_for_features.new_set()
gd_feature_set = 0
i for feature in record.features:
if feature.type != "gene":
# Exclude this feature
continuegd_feature_set.add_feature(
feature,sigil="ARROW",
color=gene_colors[i],
label=True,
name=str(i + 1),
label_position="start",
label_size=6,
label_angle=0,
)+= 1
i
gd_diagram.draw(format="linear", pagesize="A4", fragments=1, start=0, end=max_len)
gd_diagram.write(name + ".pdf", "PDF")
gd_diagram.write(name + ".eps", "EPS")
gd_diagram.write(name + ".svg", "SVG")
23.7 Cross-link Data
Time to find similarities and differences.
= [
A_vs_B 99, "ND1", "ND1"),
(33, "ND2", "ND2"),
(94, "COX1", "COX1"),
(100, "COX2", "COX2"),
(97, "ATP8", "ATP8"),
(98, "ATP6", "ATP6"),
(98, "COX3", "COX3"),
(100, "ND3", "ND3"),
(100, "ND4L", "ND4L"),
(94, "ND4", "ND4"),
(87, "ND5", "ND5"),
(94, "ND6", "ND6"),
(94, "CYTB", "CYTB"),
(
]
= [
B_vs_C 49, "ND1", "ND1"),
(53, "ND2", "ND2"),
(64, "COX1", "COX1"),
(30, "COX2", "COX2"),
(27, "ATP8", "ATP8"),
(88, "ATP6", "ATP6"),
(68, "COX3", "COX3"),
(30, "ND3", "ND3"),
(100, "ND4L", "ND4L"),
(74, "ND4", "ND4"),
(37, "ND5", "ND5"),
(74, "ND6", "ND6"),
(64, "CYTB", "CYTB"),
(
]
get_feature(features, id, tags=["locus_tag", "gene"]):
def """Search list of SeqFeature objects for an identifier under the given tags."""
for f in features:
for key in tags:
# tag may not be present in this feature
for x in f.qualifiers.get(key, []):
if x == id:
return fKeyError(id)
raise
from Bio.Graphics.GenomeDiagram import CrossLink
from reportlab.lib import colors
for rec_X, tn_X, rec_Y, tn_Y, X_vs_Y in [
3, B_rec, 2, A_vs_B),
(A_rec, 2, C_rec, 1, B_vs_C),
(B_rec, :
]= gd_diagram.tracks[tn_X]
track_X = gd_diagram.tracks[tn_Y]
track_Y for score, id_X, id_Y in X_vs_Y:
= get_feature(rec_X.features, id_X)
feature_X = get_feature(rec_Y.features, id_Y)
feature_Y = colors.linearlyInterpolatedColor(
color 0, 100, score
colors.white, colors.firebrick,
)= CrossLink(
link_xy
(track_X, feature_X.location.start, feature_X.location.end),
(track_Y, feature_Y.location.start, feature_Y.location.end),
color,
colors.lightgrey,
)gd_diagram.cross_track_links.append(link_xy)
gd_diagram.draw(format="linear", pagesize="A4", fragments=1, start=0, end=max_len)
gd_diagram.write(name + ".pdf", "PDF")
gd_diagram.write(name + ".eps", "EPS")
gd_diagram.write(name + ".svg", "SVG")
23.8 Clean Cross-Link
from reportlab.lib import colors
from reportlab.lib.colors import red, grey, orange, green, brown
from reportlab.lib.colors import blue, lightblue, purple
from Bio.Graphics import GenomeDiagram
from Bio.Graphics.GenomeDiagram import CrossLink
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
= "Example3"
name
= SeqIO.read("XL.gb", "gb")
A_rec = SeqIO.read("XT.gb", "gb")
B_rec = SeqIO.read("NP.gb", "gb")
C_rec = {rec.name: rec for rec in [A_rec, B_rec, C_rec]}
records
= (
A_colors
[red]+ [grey]
+ [orange]
+ [grey]
+ [orange]
+ [grey]
+ [green]
+ [grey]
+ [green]
+ [grey, green]
+ [brown]
+ [blue]
+ [lightblue]
+ [grey, lightblue]
+ [purple]
+ [grey]
)= (
B_colors
[red]+ [grey]
+ [orange]
+ [grey]
+ [orange]
+ [grey]
+ [green]
+ [grey]
+ [brown]
+ [blue]
+ [lightblue]
+ [grey]
+ [purple]
)= (
C_colors
[grey]+ [green]
+ [brown]
+ [blue]
+ [grey, blue]
+ [lightblue]
+ [grey] * 8
)
= [
A_vs_B 99, "ND1", "ND1"),
(33, "ND2", "ND2"),
(94, "COX1", "COX1"),
(100, "COX2", "COX2"),
(97, "ATP8", "ATP8"),
(98, "ATP6", "ATP6"),
(98, "COX3", "COX3"),
(100, "ND3", "ND3"),
(100, "ND4L", "ND4L"),
(94, "ND4", "ND4"),
(87, "ND5", "ND5"),
(94, "ND6", "ND6"),
(94, "CYTB", "CYTB"),
(
]
= [
B_vs_C 49, "ND1", "ND1"),
(53, "ND2", "ND2"),
(64, "COX1", "COX1"),
(30, "COX2", "COX2"),
(27, "ATP8", "ATP8"),
(88, "ATP6", "ATP6"),
(68, "COX3", "COX3"),
(30, "ND3", "ND3"),
(100, "ND4L", "ND4L"),
(74, "ND4", "ND4"),
(37, "ND5", "ND5"),
(74, "ND6", "ND6"),
(64, "CYTB", "CYTB"),
(
]
get_feature(features, id, tags=("locus_tag", "gene", "old_locus_tag")):
def """Search list of SeqFeature objects for an identifier under the given tags."""
for f in features:
for key in tags:
# tag may not be present in this feature
for x in f.qualifiers.get(key, []):
if x == id:
return fKeyError(id)
raise
= GenomeDiagram.Diagram(name)
gd_diagram = {}
feature_sets = 0
max_len for i, record in enumerate([A_rec, B_rec, C_rec]):
= max(max_len, len(record))
max_len # Allocate tracks 5 (top), 3, 1 (bottom) for A, B, C
# (empty tracks 2 and 4 add useful white space to emphasise the cross links
# and also serve to make the tracks vertically more compressed)
= gd_diagram.new_track(
gd_track_for_features 5 - 2 * i,
name=record.name,
greytrack=True,
height=0.5,
start=0,
end=len(record),
)in feature_sets
assert record.name not = gd_track_for_features.new_set()
feature_sets[record.name]
for X, Y, X_vs_Y in [
"NC_001573", "NC_006839", A_vs_B),
("NC_006839", "NC_026789", B_vs_C),
(:
]= records[X].features
features_X = records[Y].features
features_Y = feature_sets[X]
set_X = feature_sets[Y]
set_Y for score, x, y in X_vs_Y:
= colors.linearlyInterpolatedColor(
color 0, 100, score
colors.white, colors.firebrick,
)= colors.lightgrey
border = get_feature(features_X, x)
f_x = set_X.add_feature(
F_x SeqFeature(FeatureLocation(f_x.location.start, f_x.location.end, strand=0)),
color=color,
border=border,
)= get_feature(features_Y, y)
f_y = set_Y.add_feature(
F_y SeqFeature(FeatureLocation(f_y.location.start, f_y.location.end, strand=0)),
color=color,
border=border,
)gd_diagram.cross_track_links.append(CrossLink(F_x, F_y, color, border))
for record, gene_colors in zip([A_rec, B_rec, C_rec], [A_colors, B_colors, C_colors]):
= feature_sets[record.name]
gd_feature_set = 0
i for feature in record.features:
if feature.type != "gene":
# Exclude this feature
continue:
try= gene_colors[i]
g_color :
except IndexErrorprint("Don't have color for %s gene %i" % (record.name, i))
= grey
g_color gd_feature_set.add_feature(
feature,sigil="BIGARROW",
color=g_color,
label=True,
name=str(i + 1),
label_position="start",
label_size=6,
label_angle=0,
)+= 1
i
gd_diagram.draw(format="linear", pagesize="A4", fragments=1, start=0, end=max_len)
gd_diagram.write(name + ".pdf", "PDF")
gd_diagram.write(name + ".eps", "EPS")
gd_diagram.write(name + ".svg", "SVG")