Comparing two genomes in a dotplot

Decomposing genomes in separate blocks provides a very good starting point for pairwise comparison between genomes. The pangenome graph can easily be used to draw a dotplot between two different paths, in which lines represent shared blocks.

In this example we consider the klebs_pangraph.json graph generated from 9 complete chromosomes of Klebsiella Pneumoniae in a previous tutorial.

PyPangraph provides a convenient dotplot function to generate such a dotplot:

import pypangraph as pp
import matplotlib.pyplot as plt

# load pangraph
pan = pp.Pangraph.from_json("klebs_pangraph.json")

# paths to be compared
str_i, str_j = ["NZ_CP013711", "NC_017540"]

fig, ax = plt.subplots(figsize=(8, 8))
pp.dotplot(ax, str_i, str_j, pan, duplicated_color="silver", min_length=150)
plt.show()

Click here to see the full code, in case you want to customize it

import pypangraph as pp
import matplotlib.pyplot as plt
from collections import defaultdict
from itertools import product
from dataclasses import dataclass

@dataclass
class Segment:
    """
    Representation for a node of a path as a segment,
    with a start and end coordinate and a strandedness.
    """

    start: int
    end: int
    strand: bool

    def flip(self):
        """
        Flip the segment orientation.
        Exchange start and end coordinates and flip the strandedness.
        """
        return Segment(self.end, self.start, not self.strand)

    def length(self, L):
        """
        Calculate the length of the segment.
        If total path length L is provided, calculate the length modulo L.
        """
        if L is None:
            return self.end - self.start
        else:
            return (self.end - self.start) % L

    def crosses_origin(self):
        """
        Check if the segment crosses the origin, i.e. start > end
        """
        return self.start > self.end

    def split_across_origin(self, L):
        """
        Split the segment into two segments if it crosses the origin.
        """
        assert self.crosses_origin()
        s1 = Segment(self.start, L, self.strand)
        s2 = Segment(0, self.end, self.strand)
        f = s1.length(L) / self.length(L)
        return s1, s2, f

    def split_fraction(self, f, flip, L):
        """
        Split the segment into two segments based on a fraction of the segment length
        """
        assert 0 <= f <= 1
        l = self.length(L)
        l1, l2 = f * l, (1 - f) * l
        if flip:
            l1, l2 = l2, l1
        s1 = Segment(self.start, (self.start + l1) % L, self.strand)
        s2 = Segment((self.end - l2) % L, self.end, self.strand)
        return s1, s2


def block_segment_dictionary(graph, path_name):
    """
    Create a dictionary of block -> segments for a given path in the graph.
    """
    path_id = graph.paths[path_name].id
    block_sgm = defaultdict(list)
    for bid, block in graph.blocks.items():
        for nid in block.alignment.node_ids():
            node = graph.nodes[nid]
            if node.path_id == path_id:
                sgm = Segment(node.start, node.end, node.strand)
                block_sgm[bid].append(sgm)
    return dict(block_sgm)


def linear_plot(ax, sgm_i, sgm_j, **kwargs):
    """
    Dotplot for two segments
    """
    if sgm_i.strand != sgm_j.strand:
        linear_plot(ax, sgm_i, sgm_j.flip(), **kwargs)
    else:
        ax.plot([sgm_i.start, sgm_i.end], [sgm_j.start, sgm_j.end], **kwargs)


def circular_plot(ax, sgm_i, sgm_j, Li, Lj, **kwargs):
    """
    Dotplot for two segments, considering circular genomes.
    """
    if sgm_i.crosses_origin():
        si1, si2, f = sgm_i.split_across_origin(Li)
        flip = sgm_i.strand != sgm_j.strand
        sj1, sj2 = sgm_j.split_fraction(f, flip, Lj)
        circular_plot(ax, si1, sj1, Li, Lj, **kwargs)
        circular_plot(ax, si2, sj2, Li, Lj, **kwargs)
    elif sgm_j.crosses_origin():
        sj1, sj2, f = sgm_j.split_across_origin(Lj)
        flip = sgm_i.strand != sgm_j.strand
        si1, si2 = sgm_i.split_fraction(f, flip, Li)
        circular_plot(ax, si1, sj1, Li, Lj, **kwargs)
        circular_plot(ax, si2, sj2, Li, Lj, **kwargs)
    else:
        linear_plot(ax, sgm_i, sgm_j, **kwargs)


def dotplot(
    ax,
    strain_i,
    strain_j,
    graph,
    block_color="black",
    no_duplicates=False,
    duplicated_color="silver",
    min_length=None,
    circular=True,
):
    """
    Creates a dotplot comparing two paths.

    Parameters:
    ax (matplotlib.axes.Axes): The matplotlib axes object where the plot will be drawn.
    strain_i (str): The identifier for the first path.
    strain_j (str): The identifier for the second path.
    graph (Graph): pangenome graph object.
    block_color (str, optional): The color used for non-duplicated blocks. Defaults to "black".
    no_duplicates (bool, optional): If True, duplicated blocks will not be plotted. Defaults to False.
    duplicated_color (str, optional): The color used for duplicated blocks. Defaults to "silver".
    min_length (int, optional): Minimum length of blocks to be plotted. Defaults to None.
    circular (bool, optional): If True, plots the segments in a circular layout. If False, uses a linear layout. Defaults to True.
    """
    # Create block segment dictionaries for the paths
    bs_i = block_segment_dictionary(graph, strain_i)
    bs_j = block_segment_dictionary(graph, strain_j)

    # Get the nucleotide lengths of the paths
    Li = graph.paths[strain_i].nuc_len
    Lj = graph.paths[strain_j].nuc_len

    # Plot the segments
    for bid, seg_i in bs_i.items():
        if bid not in bs_j:
            continue
        color = block_color
        seg_j = bs_j[bid]

        if min_length is not None:
            # optionally skip short blocks
            block_len = len(graph.blocks[bid].consensus())
            if block_len < min_length:
                continue

        if (len(seg_i) > 1) or (len(seg_j) > 1):
            if no_duplicates:
                continue
            # color duplicated blocks in gray
            color = duplicated_color

        for si, sj in product(seg_i, seg_j):
            if circular:
                circular_plot(ax, si, sj, Li, Lj, color=color)
            else:
                linear_plot(ax, si, sj, color=color)

# Load the Pangraph from a JSON file
pan = pp.Pangraph.from_json("klebs_pangraph.json")

# paths to be compared
str_i, str_j = ["NZ_CP013711", "NC_017540"]

fig, ax = plt.subplots(figsize=(8, 8))
dotplot(ax, str_i, str_j, pan)
plt.tight_layout()
plt.show()

dotplot

Block that are not duplicated between the two paths are displayed in black. We observe that the chromosomes differ in a large inversion.

Duplicated blocks are depicted in gray. These form a checkerboard pattern, typical of blocks that are repeated multiple times on the genome. We can get the list of most duplicated blocks using the to_blockstats_df method:

df = pan.to_blockstats_df()
df[df["duplicated"]].sort_values("count", ascending=False).head(3)

# block_id              count  n_strains  duplicated   core   len
# 3121046370622183165     101          9        True  False   236
# 17902739066067298048     72          9        True  False  1733
# 9722346788777533867      72          9        True  False  3062

It is also easy to retrieve the consensus sequence of one of these blocks with:

b = pan.blocks[17902739066067298048]
print(b.consensus())
# TTCATCAGACAATCTGTGTGAGCACTACAAAGGCAGGTTCTTTAAGGTAAGGA...

Nucleotide blast of this sequence reveals a match with the 16S ribosomal RNA gene.

blast