Latest Threat Research: SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains. Details
Socket
Book a Demo | Install | Sign in
Socket

selfies

Package Overview
Dependencies
Maintainers
1
Versions
16
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

selfies - npm Package Compare versions

Comparing version
1.0.4
to
2.0.0
+199
selfies/bond_constraints.py
import functools
from itertools import product
from typing import Dict, Set, Union
from selfies.constants import ELEMENTS, INDEX_ALPHABET
# Baseline bonding capacities. Keys are an element symbol, optionally
# followed by a signed charge (e.g. "N+1"); the "?" entry is the capacity
# used for any atom that has no entry of its own.
_DEFAULT_CONSTRAINTS = {
    "H": 1, "F": 1, "Cl": 1, "Br": 1, "I": 1,
    "B": 3, "B+1": 2, "B-1": 4,
    "O": 2, "O+1": 3, "O-1": 1,
    "N": 3, "N+1": 4, "N-1": 2,
    "C": 4, "C+1": 5, "C-1": 3,
    "P": 5, "P+1": 6, "P-1": 4,
    "S": 6, "S+1": 7, "S-1": 5,
    "?": 8
}

# Each preset is an independent copy of the baseline table; the two
# non-default presets override a handful of entries in their copy.
_PRESET_CONSTRAINTS = {
    "default": dict(_DEFAULT_CONSTRAINTS),
    "octet_rule": {
        **_DEFAULT_CONSTRAINTS,
        "S": 2, "S+1": 3, "S-1": 1, "P": 3, "P+1": 4, "P-1": 2,
    },
    "hypervalent": {
        **_DEFAULT_CONSTRAINTS,
        "Cl": 7, "Br": 7, "I": 7, "N": 5,
    },
}

# The constraints currently in effect; initially the "default" preset object.
_current_constraints = _PRESET_CONSTRAINTS["default"]
def get_preset_constraints(name: str) -> Dict[str, int]:
    """Looks up one of the built-in sets of semantic constraints.

    In addition to the default constraints, :mod:`selfies` ships presets
    that enforce the `octet rule
    <https://en.wikipedia.org/wiki/Octet_rule>`_ and presets that
    accommodate `hypervalent molecules
    <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_.

    The presets differ only in the following entries:

    .. table::
        :align: center
        :widths: auto

        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
        |                 | Cl, Br, I | N | P | P+1 | P-1 | S | S+1 | S-1 |
        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
        | ``default``     |     1     | 3 | 5 |  6  |  4  | 6 |  7  |  5  |
        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
        | ``octet_rule``  |     1     | 3 | 3 |  4  |  2  | 2 |  3  |  1  |
        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
        | ``hypervalent`` |     7     | 5 | 5 |  6  |  4  | 6 |  7  |  5  |
        +-----------------+-----------+---+---+-----+-----+---+-----+-----+

    :param name: the preset name: ``default`` or ``octet_rule`` or
        ``hypervalent``.
    :return: a fresh dictionary mapping atoms (the keys) to their bonding
        capacities (the values); mutating it does not affect the preset.
    :raises ValueError: if ``name`` is not a known preset.
    """

    preset = _PRESET_CONSTRAINTS.get(name)
    if preset is None:
        raise ValueError("unrecognized preset name '{}'".format(name))
    return dict(preset)  # copy, so callers cannot alter the stored preset
def get_semantic_constraints() -> Dict[str, int]:
    """Reports the semantic constraints currently in effect.

    :return: a copy of the current semantic constraints, as a dictionary
        that maps atoms (the keys) to their bonding capacities (the values).
    """

    # reading a module-level name needs no ``global`` declaration
    return _current_constraints.copy()
def set_semantic_constraints(
        bond_constraints: Union[str, Dict[str, int]] = "default"
) -> None:
    """Updates the semantic constraints that :mod:`selfies` operates on.

    If the input is a string, the new constraints are taken to be
    the preset named ``bond_constraints``
    (see :func:`selfies.get_preset_constraints`).

    Otherwise, the input is a dictionary representing the new constraints.
    This dictionary maps atoms (the keys) to non-negative bonding
    capacities (the values); the atoms are specified by strings
    of the form ``E`` or ``E+C`` or ``E-C``,
    where ``E`` is an element symbol and ``C`` is a positive integer.
    For example, one may have:

    * ``bond_constraints["I-1"] = 0``
    * ``bond_constraints["C"] = 4``

    This dictionary must also contain the special ``?`` key, which indicates
    the bond capacities of all atoms that are not explicitly listed
    in the dictionary.

    :param bond_constraints: the name of a preset, or a dictionary
        representing the new semantic constraints.
    :return: ``None``.
    :raises ValueError: if the preset name is unknown, if the input is
        neither a string nor a dictionary, if the ``?`` key is missing,
        or if any key or value of the dictionary is invalid.
    """

    global _current_constraints

    if isinstance(bond_constraints, str):
        _current_constraints = get_preset_constraints(bond_constraints)

    elif isinstance(bond_constraints, dict):

        # validate everything before touching the current constraints
        if "?" not in bond_constraints:
            raise ValueError("bond_constraints missing '?' as a key")

        for key, value in bond_constraints.items():

            # error checking for keys
            if key == "?":
                valid = True
            elif not isinstance(key, str):
                # report non-string keys as invalid instead of crashing
                # on the string methods used below
                valid = False
            else:
                j = max(key.find("+"), key.find("-"))
                if j == -1:
                    valid = (key in ELEMENTS)
                else:
                    # the charge magnitude must be plain ASCII digits;
                    # str.isnumeric() would also accept e.g. "²" or "½"
                    magnitude = key[j + 1:]
                    valid = (
                        (key[:j] in ELEMENTS)
                        and magnitude != ""
                        and all(ch in "0123456789" for ch in magnitude)
                    )
            if not valid:
                err_msg = "invalid key '{}' in bond_constraints".format(key)
                raise ValueError(err_msg)

            # error checking for values
            if not (isinstance(value, int) and value >= 0):
                err_msg = "invalid value at " \
                          "bond_constraints['{}'] = {}".format(key, value)
                raise ValueError(err_msg)

        # store a copy so later changes to the caller's dict have no effect
        _current_constraints = dict(bond_constraints)

    else:
        raise ValueError("bond_constraints must be a str or dict")

    # clear cache since we changed alphabet
    get_semantic_robust_alphabet.cache_clear()
    get_bonding_capacity.cache_clear()
@functools.lru_cache()
def _cached_semantic_robust_alphabet() -> frozenset:
    """Builds (once per set of constraints) the robust alphabet.

    The result is immutable so that the cached value cannot be altered
    by callers.
    """

    alphabet_subset = set()
    bonds = {"": 1, "=": 2, "#": 3}

    # add atomic symbols: every (atom, bond) pair whose bond order does not
    # exceed the atom's bonding capacity; the "?" fallback is not an atom
    for (a, c), (b, m) in product(_current_constraints.items(), bonds.items()):
        if (m > c) or (a == "?"):
            continue
        symbol = "[{}{}]".format(b, a)
        alphabet_subset.add(symbol)

    # add branch and ring symbols
    for i in range(1, 4):
        alphabet_subset.add("[Ring{}]".format(i))
        alphabet_subset.add("[=Ring{}]".format(i))
        alphabet_subset.add("[Branch{}]".format(i))
        alphabet_subset.add("[=Branch{}]".format(i))
        alphabet_subset.add("[#Branch{}]".format(i))

    alphabet_subset.update(INDEX_ALPHABET)

    return frozenset(alphabet_subset)


def get_semantic_robust_alphabet() -> Set[str]:
    """Returns a subset of all SELFIES symbols that are constrained
    by :mod:`selfies` under the current semantic constraints.

    The returned set is a fresh copy on every call, so callers may freely
    modify it (e.g. add ``[nop]``) without affecting later calls.

    :return: a subset of all SELFIES symbols that are semantically constrained.
    """

    return set(_cached_semantic_robust_alphabet())


# Keep the cache-control API of the previously lru_cache-wrapped function,
# since other code calls ``get_semantic_robust_alphabet.cache_clear()``.
get_semantic_robust_alphabet.cache_clear = \
    _cached_semantic_robust_alphabet.cache_clear
get_semantic_robust_alphabet.cache_info = \
    _cached_semantic_robust_alphabet.cache_info
@functools.lru_cache()
def get_bonding_capacity(element: str, charge: int) -> int:
    """Looks up how many bonds an atom may form under the current
    semantic constraints.

    :param element: the element of the input atom.
    :param charge: the charge of the input atom.
    :return: the bonding capacity of the input atom; the capacity stored
        under ``?`` is used when the atom has no entry of its own.
    """

    # neutral atoms are keyed by element alone; charged ones get an
    # explicitly signed suffix, e.g. ("N", 1) -> "N+1", ("O", -1) -> "O-1"
    atom_key = element if charge == 0 else element + "{:+}".format(charge)

    lookup = atom_key if atom_key in _current_constraints else "?"
    return _current_constraints[lookup]
from selfies.utils.smiles_utils import atom_to_smiles, smiles_to_atom
def modernize_symbol(symbol):
    """Translates a pre-v2 SELFIES symbol into its current spelling.

    :param symbol: an old SELFIES symbol.
    :return: the latest equivalent of the input symbol, or the input symbol
        itself, if no such equivalent exists.
    """

    # branch / ring symbols have a fixed translation
    renamed = _SYMBOL_UPDATE_TABLE.get(symbol)
    if renamed is not None:
        return renamed

    if not symbol.endswith("expl]"):  # only [XXXexpl] symbols remain
        return symbol

    # peel off an optional leading bond character and the "expl" suffix
    has_bond = symbol[1] in "=#/\\"
    bond_char = symbol[1] if has_bond else ""
    atom_symbol = symbol[2:-5] if has_bond else symbol[1:-5]

    atom = smiles_to_atom("[{}]".format(atom_symbol))
    if (atom is None) or atom.is_aromatic:
        return symbol  # unparseable or aromatic: leave untouched

    atom_symbol = atom_to_smiles(atom, brackets=False)  # standardize
    return "[{}{}]".format(bond_char, atom_symbol)
def _build_update_table():
    """Builds the map from old branch/ring symbols to their new spelling.

    :return: a dictionary with one entry per (template, index) pair, for
        indices 1, 2 and 3.
    """

    # (old template, new template); "{}" is filled with the index 1..3
    templates = (
        ("[Branch{}_1]", "[Branch{}]"),
        ("[Branch{}_2]", "[=Branch{}]"),
        ("[Branch{}_3]", "[#Branch{}]"),
        ("[Expl=Ring{}]", "[=Ring{}]"),
        ("[Expl#Ring{}]", "[#Ring{}]"),
        ("[Expl/Ring{}]", "[//Ring{}]"),
        ("[Expl\\Ring{}]", "[\\\\Ring{}]"),
    )
    return {
        old.format(index): new.format(index)
        for index in range(1, 4)
        for old, new in templates
    }


# computed once at import time
_SYMBOL_UPDATE_TABLE = _build_update_table()
# Set of chemical element symbols (capitalized, one or two letters).
ELEMENTS = {
"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg",
"Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br",
"Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
"Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "Hf",
"Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi",
"Po", "At", "Rn", "Fr", "Ra", "Rf", "Db", "Sg", "Bh", "Hs", "Mt",
"Ds", "Rg", "Cn", "Fl", "Lv", "La", "Ce", "Pr", "Nd", "Pm", "Sm",
"Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Ac", "Th",
"Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md",
"No", "Lr"
}
# Element symbols treated as the SMILES "organic subset"
# (presumably those that may be written without brackets - confirm
# against the SMILES parser).
ORGANIC_SUBSET = {"B", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"}
# Maps each element that may be aromatic to a tuple of valences,
# listed in increasing order.
AROMATIC_VALENCES = {
"B": (3,), "Al": (3,),
"C": (4,), "Si": (4,),
"N": (3, 5), "P": (3, 5), "As": (3, 5),
"O": (2, 4), "S": (2, 4), "Se": (2, 4), "Te": (2, 4)
}
# Lowercase spellings of the keys of AROMATIC_VALENCES (e.g. "c", "se").
AROMATIC_SUBSET = set(e.lower() for e in AROMATIC_VALENCES)
# =============================================================================
# SELFIES-specific constants
# =============================================================================
# Ordered tuple of SELFIES symbols; a symbol's position in this tuple is
# its value in INDEX_CODE below.
INDEX_ALPHABET = (
"[C]", "[Ring1]", "[Ring2]",
"[Branch1]", "[=Branch1]", "[#Branch1]",
"[Branch2]", "[=Branch2]", "[#Branch2]",
"[O]", "[N]", "[=N]", "[=C]", "[#C]", "[S]", "[P]"
)
# Maps each symbol of INDEX_ALPHABET to its position (0-based).
INDEX_CODE = {c: i for i, c in enumerate(INDEX_ALPHABET)}
class SMILESParserError(ValueError):
    """Exception raised when a SMILES fails to be parsed.

    :ivar smiles: the SMILES string that failed to parse.
    :ivar reason: a short description of the failure.
    :ivar idx: the index into ``smiles`` at which the failure was detected
        (``-1`` by default).
    """

    def __init__(self, smiles, reason="N/A", idx=-1):
        self.smiles = smiles
        self.idx = idx
        self.reason = reason

    def __str__(self):
        # The caret line is padded with as many characters as the
        # "SMILES: " label (8) so that the caret lands directly under
        # the character at position ``idx`` of the SMILES string.
        err_msg = "\n" \
                  "\tSMILES: {smiles}\n" \
                  "\t        {pointer}\n" \
                  "\tIndex: {index}\n" \
                  "\tReason: {reason}"

        return err_msg.format(
            smiles=self.smiles,
            pointer=(" " * self.idx + "^"),
            index=self.idx,
            reason=self.reason
        )
class EncoderError(Exception):
    """Error type raised by :func:`selfies.encoder`."""
class DecoderError(Exception):
    """Error type raised by :func:`selfies.decoder`."""
import functools
import itertools
from typing import List, Optional, Union
from selfies.bond_constraints import get_bonding_capacity
from selfies.constants import AROMATIC_VALENCES
from selfies.utils.matching_utils import find_perfect_matching
class Atom:
    """An atom with associated specifications (e.g. charge, chirality).

    :ivar index: position of the atom in its graph; ``None`` until the atom
        is added to a graph.
    :ivar element: the element symbol.
    :ivar is_aromatic: whether the atom is aromatic.
    :ivar isotope: the isotope number, or ``None`` if unspecified.
    :ivar chirality: ``"@"``, ``"@@"`` or ``None`` if unspecified.
    :ivar h_count: the explicit hydrogen count, or ``None`` if unspecified.
    :ivar charge: the formal charge.
    """

    def __init__(
            self,
            element: str,
            is_aromatic: bool,
            isotope: Optional[int] = None,
            chirality: Optional[str] = None,
            h_count: Optional[int] = None,
            charge: int = 0
    ):
        self.index = None
        self.element = element
        self.is_aromatic = is_aromatic
        self.isotope = isotope
        self.chirality = chirality
        self.h_count = h_count
        self.charge = charge

    @property
    def bonding_capacity(self):
        """The number of bonds this atom can still form: the bonding
        capacity of its (element, charge) pair minus its explicit hydrogens.

        Deliberately not memoized per instance: a cache keyed on ``self``
        would keep atoms alive inside the cache and would return stale
        values after ``h_count`` / ``charge`` or the global semantic
        constraints change. ``get_bonding_capacity`` is the function
        imported from ``selfies.bond_constraints``.
        """
        bond_cap = get_bonding_capacity(self.element, self.charge)
        bond_cap -= 0 if (self.h_count is None) else self.h_count
        return bond_cap

    def invert_chirality(self) -> None:
        """Swaps ``@`` and ``@@``; any other chirality value is left as is."""
        if self.chirality == "@":
            self.chirality = "@@"
        elif self.chirality == "@@":
            self.chirality = "@"
class DirectedBond:
    """A bond together with the direction in which it is traversed.

    :ivar src: index of the atom the bond leaves from.
    :ivar dst: index of the atom the bond points to.
    :ivar order: the bond order (a float for order 1.5).
    :ivar stereo: the stereo marker of the bond, or ``None``.
    :ivar ring_bond: whether the bond was created as a ring bond.
    """

    def __init__(
            self,
            src: int,
            dst: int,
            order: Union[int, float],
            stereo: Optional[str],
            ring_bond: bool
    ):
        # endpoints
        self.src, self.dst = src, dst
        # bond properties
        self.order, self.stereo = order, stereo
        self.ring_bond = ring_bond
class MolecularGraph:
    """A molecule represented as a weighted directed graph.

    Molecules are naturally weighted undirected graphs, but SMILES and
    SELFIES strings are more conveniently modelled as weighted directed
    graphs: the direction of each edge records the order in which atoms
    and bonds appear in the string.
    """

    def __init__(self):
        self._roots = []  # indices of root atoms, where traversal begins
        self._atoms = []  # all atoms, positioned by their index
        self._bond_dict = {}  # (src, dst) -> DirectedBond
        self._adj_list = []  # per atom: its outgoing bonds, in order
        self._bond_counts = []  # per atom: sum of its bond orders
        self._ring_bond_flags = []  # per atom: has it made a ring bond?
        self._delocal_subgraph = {}  # atom -> neighbours via order-1.5 bonds

    def __len__(self):
        return len(self._atoms)

    def has_bond(self, a: int, b: int) -> bool:
        """Tells whether a bond is stored under the ordered pair
        ``(min(a, b), max(a, b))``."""
        key = (a, b) if a <= b else (b, a)
        return key in self._bond_dict

    def has_out_ring_bond(self, src: int) -> bool:
        return self._ring_bond_flags[src]

    def get_roots(self) -> List[int]:
        return self._roots

    def get_atom(self, idx: int) -> Atom:
        return self._atoms[idx]

    def get_atoms(self) -> List[Atom]:
        return self._atoms

    def get_dirbond(self, src, dst) -> DirectedBond:
        return self._bond_dict[(src, dst)]

    def get_out_dirbonds(self, src: int) -> List[DirectedBond]:
        return self._adj_list[src]

    def get_bond_count(self, idx: int) -> int:
        return self._bond_counts[idx]

    def add_atom(self, atom: Atom, mark_root: bool = False) -> None:
        """Appends ``atom`` to the graph and assigns it the next index."""
        new_index = len(self)
        atom.index = new_index
        if mark_root:
            self._roots.append(new_index)

        # one fresh slot per atom in every per-atom table
        self._atoms.append(atom)
        self._adj_list.append([])
        self._bond_counts.append(0)
        self._ring_bond_flags.append(False)

        if atom.is_aromatic:
            self._delocal_subgraph[new_index] = []

    def add_bond(
            self, src: int, dst: int,
            order: Union[int, float], stereo: str
    ) -> None:
        """Adds a (non-ring) bond from ``src`` to ``dst``; ``src`` must be
        the smaller index."""
        assert src < dst

        self._add_bond_at_loc(
            DirectedBond(src, dst, order, stereo, False), -1
        )
        for endpoint in (src, dst):
            self._bond_counts[endpoint] += order

        if order == 1.5:  # aromatic bond: record it in the DS
            self._delocal_subgraph.setdefault(src, []).append(dst)
            self._delocal_subgraph.setdefault(dst, []).append(src)

    def add_placeholder_bond(self, src: int) -> int:
        """Reserves a slot among the outgoing bonds of ``src`` and
        returns the slot's position."""
        slots = self._adj_list[src]
        slots.append(None)
        return len(slots) - 1

    def add_ring_bond(
            self, a: int, b: int,
            order: Union[int, float],
            a_stereo: Optional[str], b_stereo: Optional[str],
            a_pos: int = -1, b_pos: int = -1
    ) -> None:
        """Adds a ring bond between ``a`` and ``b``, stored once in each
        direction."""
        self._add_bond_at_loc(DirectedBond(a, b, order, a_stereo, True), a_pos)
        self._add_bond_at_loc(DirectedBond(b, a, order, b_stereo, True), b_pos)

        for endpoint in (a, b):
            self._bond_counts[endpoint] += order
            self._ring_bond_flags[endpoint] = True

        if order == 1.5:  # aromatic bond: record it in the DS
            self._delocal_subgraph.setdefault(a, []).append(b)
            self._delocal_subgraph.setdefault(b, []).append(a)

    def update_bond_order(
            self, a: int, b: int,
            new_order: Union[int, float]
    ) -> None:
        """Changes the order of the bond between ``a`` and ``b`` and keeps
        the per-atom bond counts in sync."""
        assert 1 <= new_order <= 3

        low, high = (a, b) if a <= b else (b, a)
        forward = self._bond_dict[(low, high)]
        previous_order = forward.order
        if new_order == previous_order:
            return

        # ring bonds are stored in both directions; update both copies
        affected = [forward]
        if forward.ring_bond:
            affected.append(self._bond_dict[(high, low)])
        for bond in affected:
            bond.order = new_order

        delta = new_order - previous_order
        self._bond_counts[low] += delta
        self._bond_counts[high] += delta

    def _add_bond_at_loc(self, bond, pos):
        """Registers ``bond`` and places it at position ``pos`` among the
        outgoing bonds of its source (``-1`` means: append)."""
        self._bond_dict[(bond.src, bond.dst)] = bond

        outgoing = self._adj_list[bond.src]
        if pos in (-1, len(outgoing)):
            outgoing.append(bond)
        elif outgoing[pos] is None:
            outgoing[pos] = bond  # fill a previously reserved placeholder
        else:
            outgoing.insert(pos, bond)

    def is_kekulized(self) -> bool:
        return not self._delocal_subgraph

    def kekulize(self) -> bool:
        """Replaces all order-1.5 bonds by single and double bonds.

        :return: ``True`` on success (or if there was nothing to do),
            ``False`` if no perfect matching of the pruned delocalization
            subgraph (DS) exists; in that case the graph is unchanged.
        """
        # Algorithm based on Depth-First article by Richard L. Apodaca
        # Reference:
        # https://depth-first.com/articles/2020/02/10/
        # a-comprehensive-treatment-of-aromaticity-in-the-smiles-language/

        if self.is_kekulized():
            return True

        ds = self._delocal_subgraph
        kept_nodes = {n for n in ds if not self._prune_from_ds(n)}

        # relabel kept DS nodes to be 0, 1, 2, ...
        label_to_node = sorted(kept_nodes)
        node_to_label = {n: lbl for lbl, n in enumerate(label_to_node)}

        # pruned and relabelled DS, as an adjacency list indexed by label
        pruned_ds = [
            [node_to_label[adj] for adj in ds[n] if adj in kept_nodes]
            for n in label_to_node
        ]

        matching = find_perfect_matching(pruned_ds)
        if matching is None:
            return False

        # de-aromatize: every DS bond becomes a single bond ...
        for node, neighbours in ds.items():
            for adj in neighbours:
                self.update_bond_order(node, adj, new_order=1)
            self._atoms[node].is_aromatic = False
            self._bond_counts[node] = int(self._bond_counts[node])

        # ... and every matched pair becomes a double bond (each pair is
        # seen from both ends; the second update is a no-op)
        for label, mate_label in enumerate(matching):
            self.update_bond_order(
                label_to_node[label], label_to_node[mate_label], new_order=2
            )

        self._delocal_subgraph = dict()  # clear DS
        return True

    def _prune_from_ds(self, node):
        """Decides whether ``node`` is left out of the DS handed to the
        matching step of :meth:`kekulize`."""
        adj_nodes = self._delocal_subgraph[node]
        if not adj_nodes:
            return True  # aromatic atom with no aromatic bonds

        atom = self._atoms[node]
        valences = AROMATIC_VALENCES[atom.element]

        # each bond in DS has order 1.5 - we treat them as single bonds
        used_electrons = int(self._bond_counts[node] - 0.5 * len(adj_nodes))

        if atom.h_count is None:  # account for implicit Hs
            assert atom.charge == 0
            return used_electrons in valences

        valence = valences[-1] - atom.charge
        free_electrons = valence - (used_electrons + atom.h_count)
        # keep the node only if it has a non-negative, odd number of
        # free electrons
        return (free_electrons < 0) or (free_electrons % 2 == 0)
from typing import Dict, List, Tuple, Union
from selfies.utils.selfies_utils import len_selfies, split_selfies
def selfies_to_encoding(
        selfies: str,
        vocab_stoi: Dict[str, int],
        pad_to_len: int = -1,
        enc_type: str = 'both'
) -> Union[List[int], List[List[int]], Tuple[List[int], List[List[int]]]]:
    """Encodes a SELFIES string as labels (integers) and/or one-hot vectors.

    The label encoding is a list of shape ``(L,)`` and the one-hot encoding
    is a 2D list of shape ``(L, len(vocab_stoi))``, where ``L`` is the
    symbol length of the (possibly padded) SELFIES string.

    :param selfies: the SELFIES string to be encoded.
    :param vocab_stoi: a dictionary that maps SELFIES symbols to indices,
        which must be non-negative and contiguous, starting from 0.
        If the SELFIES string is to be padded, then the special padding
        symbol ``[nop]`` must also be a key in this dictionary.
    :param pad_to_len: the length that the SELFIES string is padded to.
        If this value is less than or equal to the symbol length of the
        SELFIES string, then no padding is added. Defaults to ``-1``.
    :param enc_type: the type of encoding of the output:
        ``label`` or ``one_hot`` or ``both``.
        If this value is ``both``, then a tuple of the label and one-hot
        encodings is returned. Defaults to ``both``.
    :return: the label encoded and/or one-hot encoded SELFIES string.
    :raises ValueError: if ``enc_type`` is not one of the three options.

    :Example:

    >>> import selfies as sf
    >>> sf.selfies_to_encoding("[C][F]", {"[C]": 0, "[F]": 1})
    ([0, 1], [[1, 0], [0, 1]])
    """

    if enc_type not in ("label", "one_hot", "both"):
        raise ValueError("enc_type must be in ('label', 'one_hot', 'both')")

    # right-pad with [nop] up to the requested symbol length
    n_missing = pad_to_len - len_selfies(selfies)
    if n_missing > 0:
        selfies += "[nop]" * n_missing

    # label encoding: one vocabulary index per symbol
    labels = [vocab_stoi[symbol] for symbol in split_selfies(selfies)]
    if enc_type == "label":
        return labels

    # one-hot encoding: one row per symbol, with a single 1 at its index
    vocab_size = len(vocab_stoi)
    one_hot = []
    for label in labels:
        row = [0] * vocab_size
        row[label] = 1
        one_hot.append(row)

    return one_hot if enc_type == "one_hot" else (labels, one_hot)
def encoding_to_selfies(
        encoding: Union[List[int], List[List[int]]],
        vocab_itos: Dict[int, str],
        enc_type: str,
) -> str:
    """Decodes a label (integer) or one-hot encoding into a SELFIES string.

    A label encoding is expected to be a list of shape ``(L,)``; a one-hot
    encoding is expected to be a 2D list of shape ``(L, len(vocab_itos))``.

    :param encoding: a label or one-hot encoding.
    :param vocab_itos: a dictionary that maps indices to SELFIES symbols.
        The indices of this dictionary must be non-negative and contiguous,
        starting from 0.
    :param enc_type: the type of encoding of the input:
        ``label`` or ``one_hot``.
    :return: the SELFIES string represented by the input encoding.
    :raises ValueError: if ``enc_type`` is neither ``label`` nor
        ``one_hot``, or if a one-hot row contains no ``1``.

    :Example:

    >>> import selfies as sf
    >>> one_hot = [[0, 1, 0], [0, 0, 1], [1, 0, 0]]
    >>> vocab_itos = {0: "[nop]", 1: "[C]", 2: "[F]"}
    >>> sf.encoding_to_selfies(one_hot, vocab_itos, enc_type="one_hot")
    '[C][F][nop]'
    """

    if enc_type not in ("label", "one_hot"):
        raise ValueError("enc_type must be in ('label', 'one_hot')")

    if enc_type == "one_hot":
        # the label of a row is the position of its first 1
        labels = [row.index(1) for row in encoding]
    else:
        labels = encoding

    # look up each label and concatenate the symbols
    symbols = [vocab_itos[label] for label in labels]
    return "".join(symbols)
def batch_selfies_to_flat_hot(
        selfies_batch: List[str],
        vocab_stoi: Dict[str, int],
        pad_to_len: int = -1,
) -> List[List[int]]:
    """One-hot encodes every SELFIES string of a list and flattens each
    encoding into a single list.

    Each string is encoded with :func:`selfies.selfies_to_encoding`, to
    which ``vocab_stoi`` and ``pad_to_len`` are passed through.

    :param selfies_batch: the list of SELFIES strings to be encoded.
    :param vocab_stoi: a dictionary that maps SELFIES symbols to indices.
    :param pad_to_len: the length that each SELFIES string in the input list
        is padded to. Defaults to ``-1``.
    :return: the flattened one-hot encodings of the input list.

    :Example:

    >>> import selfies as sf
    >>> batch = ["[C]", "[C][C]"]
    >>> vocab_stoi = {"[nop]": 0, "[C]": 1}
    >>> sf.batch_selfies_to_flat_hot(batch, vocab_stoi, 2)
    [[0, 1, 1, 0], [0, 1, 0, 1]]
    """

    return [
        # rows of the (L, vocab) matrix, laid out one after another
        [bit
         for row in selfies_to_encoding(entry, vocab_stoi, pad_to_len,
                                        enc_type="one_hot")
         for bit in row]
        for entry in selfies_batch
    ]
def batch_flat_hot_to_selfies(
        one_hot_batch: List[List[int]],
        vocab_itos: Dict[int, str],
) -> List[str]:
    """Decodes a list of flattened one-hot encodings into SELFIES strings.

    Each encoding is cut back into rows of ``len(vocab_itos)`` entries and
    then decoded with :func:`selfies.encoding_to_selfies`, to which
    ``vocab_itos`` is passed through.

    :param one_hot_batch: a list of flattened one-hot encodings. Each
        encoding must be a list of length divisible by ``len(vocab_itos)``.
    :param vocab_itos: a dictionary that maps indices to SELFIES symbols.
    :return: the list of SELFIES strings represented by the input encodings.
    :raises ValueError: if the length of an encoding is not divisible by
        the size of the vocabulary.

    :Example:

    >>> import selfies as sf
    >>> batch = [[0, 1, 1, 0], [0, 1, 0, 1]]
    >>> vocab_itos = {0: "[nop]", 1: "[C]"}
    >>> sf.batch_flat_hot_to_selfies(batch, vocab_itos)
    ['[C][nop]', '[C][C]']
    """

    decoded = []

    for flat in one_hot_batch:
        row_width = len(vocab_itos)
        if len(flat) % row_width != 0:
            raise ValueError("size of vector in one_hot_batch not divisible "
                             "by the length of the vocabulary.")

        # un-flatten: consecutive slices of row_width entries, each slice
        # being the one-hot row of one symbol position
        rows = [
            flat[start: start + row_width]
            for start in range(0, len(flat), row_width)
        ]
        decoded.append(
            encoding_to_selfies(rows, vocab_itos, enc_type="one_hot")
        )

    return decoded
from typing import Any
class SinglyLinkedList:
    """A minimal singly linked list with constant-time ``append`` and
    ``extend``.

    Every node is a two-element list ``[item, next_node]``.
    """

    def __init__(self):
        self._head = None  # first node, or None when empty
        self._tail = None  # last node, or None when empty
        self._count = 0  # number of stored items

    def __len__(self):
        return self._count

    def __iter__(self):
        return SinglyLinkedListIterator(self)

    @property
    def head(self):
        return self._head

    def append(self, item: Any) -> None:
        """Adds ``item`` at the end of the list."""
        fresh = [item, None]
        if self._head is None:
            self._head = fresh
        else:
            self._tail[1] = fresh  # link the old last node to the new one
        self._tail = fresh
        self._count += 1

    def extend(self, other) -> None:
        """Attaches all nodes of ``other`` at the end of this list.

        The nodes are linked in, not copied, so both lists share them
        afterwards.
        """
        assert isinstance(other, SinglyLinkedList)
        if other._head is None:
            return  # nothing to attach

        if self._head is None:
            self._head = other._head
        else:
            self._tail[1] = other._head
        self._tail = other._tail
        self._count += len(other)
class SinglyLinkedListIterator:
    """Iterator that walks the ``[item, next_node]`` nodes of a linked
    list, starting from its ``head``."""

    def __init__(self, linked_list):
        self._curr = linked_list.head  # next node to visit, None at the end

    def __iter__(self):
        return self

    def __next__(self):
        node = self._curr
        if node is None:
            raise StopIteration
        self._curr = node[1]  # advance to the following node
        return node[0]
import heapq
import itertools
from collections import deque
from typing import List, Optional
def find_perfect_matching(graph: List[List[int]]) -> Optional[List[int]]:
    """Searches for a perfect matching of an undirected graph (without
    self-loops).

    :param graph: an adjacency list representing the input graph.
    :return: a list representing a perfect matching, where j is the i-th
        element if nodes i and j are matched. Returns None, if no
        augmenting path can be found for some unmatched node.
    """

    # a greedy matching is a cheap starting point ...
    matching = _greedy_matching(graph)
    unmatched = {node for node, mate in enumerate(matching) if mate is None}

    # ... which is then grown, one augmenting path at a time
    while unmatched:
        root = unmatched.pop()
        path = _find_augmenting_path(graph, root, matching)
        if path is None:
            return None

        _flip_augmenting_path(matching, path)
        # both ends of the path are matched now
        unmatched.discard(path[0])
        unmatched.discard(path[-1])

    return matching
def _greedy_matching(graph):
    """Greedily pairs up nodes, handling nodes with few unmatched
    neighbours first.

    :param graph: an adjacency list.
    :return: a list in which entry i is the mate of node i, or None if
        node i was left unmatched.
    """

    matching = [None] * len(graph)

    # free_degrees[i] = number of unmatched neighbors for node i
    free_degrees = [len(neighbours) for neighbours in graph]

    # min-heap of (free degree, node): fewest unmatched neighbours first
    node_pqueue = [(degree, node) for node, degree in enumerate(free_degrees)]
    heapq.heapify(node_pqueue)

    while node_pqueue:
        _, node = heapq.heappop(node_pqueue)

        already_matched = matching[node] is not None
        if already_matched or (free_degrees[node] == 0):
            continue  # node cannot be matched

        # match node with first unmatched neighbor
        mate = next(i for i in graph[node] if matching[i] is None)
        matching[node] = mate
        matching[mate] = node

        # every neighbour of the new pair lost one unmatched neighbour;
        # re-queue those that can still be matched under their new degree
        for adj in itertools.chain(graph[node], graph[mate]):
            free_degrees[adj] -= 1
            if (matching[adj] is None) and (free_degrees[adj] > 0):
                heapq.heappush(node_pqueue, (free_degrees[adj], adj))

    return matching
def _find_augmenting_path(graph, root, matching):
    """Searches breadth-first for an augmenting path that starts at the
    unmatched node ``root``.

    :param graph: an adjacency list.
    :param root: an unmatched node.
    :param matching: the current matching (entry i is the mate of node i,
        or None).
    :return: the nodes of the path, listed from the far end back to
        ``root``, or None if the search finds no such path.
    """

    assert matching[root] is None

    # parents[v] is None for unvisited nodes; otherwise it is the pair
    # [node the search came from, neighbour of that node used to reach v]
    parents = [None] * len(graph)
    parents[root] = [None, None]

    frontier = deque([root])
    other_end = None

    while frontier and (other_end is None):
        node = frontier.popleft()

        for adj in graph[node]:
            adj_mate = matching[adj]

            if adj_mate is None:  # unmatched node
                if adj != root:  # augmenting path found!
                    parents[adj] = [node, adj]
                    other_end = adj
                    break
            elif parents[adj_mate] is None:  # adj_mate not visited
                # step over the matched edge (adj, adj_mate)
                parents[adj_mate] = [node, adj]
                frontier.append(adj_mate)

    if other_end is None:
        return None

    # walk the parent links back from the far end to the root
    path = []
    node = other_end
    while node != root:
        came_from, via = parents[node]
        path.append(via)
        path.append(came_from)
        node = came_from
    return path
def _flip_augmenting_path(matching, path):
    """Updates ``matching`` in place so that consecutive pairs of nodes
    along ``path`` (positions 0-1, 2-3, ...) become mates."""

    for start in range(0, len(path), 2):
        first = path[start]
        second = path[start + 1]
        matching[first], matching[second] = second, first
from typing import Iterable, Iterator, Set
def len_selfies(selfies: str) -> int:
    """Counts the symbols of a SELFIES string.

    Every ``[`` starts one symbol and every ``.`` is a symbol of its own.

    :param selfies: a SELFIES string.
    :return: the symbol length of the SELFIES string.

    :Example:

    >>> import selfies as sf
    >>> sf.len_selfies("[C][=C][F].[C]")
    5
    """

    return sum(selfies.count(marker) for marker in ("[", "."))
def split_selfies(selfies: str) -> Iterator[str]:
    """Lazily splits a SELFIES string into its symbols.

    :param selfies: a SELFIES string.
    :return: the symbols of the SELFIES string one-by-one with order preserved.
    :raises ValueError: (on iteration) if a symbol is never closed by ``]``.

    :Example:

    >>> import selfies as sf
    >>> list(sf.split_selfies("[C][=C][F].[C]"))
    ['[C]', '[=C]', '[F]', '.', '[C]']
    """

    total = len(selfies)
    cursor = selfies.find("[")  # -1 if the string has no symbol at all

    while 0 <= cursor < total:
        closing = selfies.find("]", cursor + 1)
        if closing == -1:
            raise ValueError("malformed SELFIES string, hanging '[' bracket")

        yield selfies[cursor: closing + 1]
        cursor = closing + 1

        # a dot directly after a symbol is emitted as its own symbol
        if selfies.startswith(".", cursor):
            yield "."
            cursor += 1
def get_alphabet_from_selfies(selfies_iter: Iterable[str]) -> Set[str]:
    """Collects the distinct symbols used by a collection of SELFIES
    strings.

    The result contains every symbol that occurs in at least one of the
    input strings, except for the dot ``.`` symbol.

    :param selfies_iter: an iterable of SELFIES strings.
    :return: an alphabet of SELFIES symbols, built from the input iterable.

    :Example:

    >>> import selfies as sf
    >>> selfies_list = ["[C][F][O]", "[C].[O]", "[F][F]"]
    >>> alphabet = sf.get_alphabet_from_selfies(selfies_list)
    >>> sorted(list(alphabet))
    ['[C]', '[F]', '[O]']
    """

    symbols = set()
    for entry in selfies_iter:
        symbols.update(split_selfies(entry))

    symbols.discard(".")  # the dot is a separator, not part of the alphabet
    return symbols
import enum
import re
from collections import deque
from typing import Iterator, Optional, Tuple, Union
from selfies.constants import AROMATIC_SUBSET, ELEMENTS, ORGANIC_SUBSET
from selfies.exceptions import SMILESParserError
from selfies.mol_graph import Atom, DirectedBond, MolecularGraph
# Matches one complete bracketed SMILES atom (e.g. [13CH3], [O-], [C@@H]).
# The six capture groups are, in order: isotope, element, chirality,
# hydrogen count, charge and atom class; optional parts that are absent
# are captured as the empty string.
SMILES_BRACKETED_ATOM_PATTERN = re.compile(
r"^[\[]" # opening square bracket [
r"(\d*)" # isotope number (optional, e.g. 123, 26)
r"([A-Za-z][a-z]?)" # element symbol
r"([@]{0,2})" # chiral_tag (optional, only @ and @@ supported)
r"((?:[H]\d?)?)" # H count (optional, e.g. H, H0, H3)
r"((?:[+]+|[-]+|[+-]\d+)?)" # charge (optional, e.g. ---, +1, ++)
r"((?:[:]\d+)?)" # atom class (optional, e.g. :12, :1)
r"[]]$" # closing square bracket ]
)
# Maps each SMILES bond character to its bond order (":" has order 1.5).
SMILES_BOND_ORDERS = {"-": 1, "/": 1, "\\": 1, ":": 1.5, "=": 2, "#": 3}
# Bond characters that are reported as the stereo marker of a bond.
SMILES_STEREO_BONDS = {"/", "\\"}
class SMILESTokenTypes(enum.Enum):
    """The kinds of tokens a SMILES string is split into."""

    ATOM = 0  # an atom symbol, bracketed or not
    BRANCH = 1  # an opening or closing branch bracket
    RING = 2  # a ring number
    DOT = 3  # the "." symbol
class SMILESToken:
    """One token of a SMILES string: a symbol (atom, branch bracket, ring
    number or dot) plus the bond character that precedes it, if any
    (e.g. =C, %12, #N).

    The token stores positions into the SMILES string rather than text.

    :ivar bond_idx: index of the preceding bond character, or ``None``.
    :ivar start_idx: index of the first character of the symbol.
    :ivar end_idx: index one past the last character of the symbol.
    :ivar token_type: the kind of the token.
    """

    def __init__(
            self,
            bond_idx: Optional[int],
            start_idx: int, end_idx: int, token_type: SMILESTokenTypes
    ):
        self.bond_idx = bond_idx
        self.start_idx, self.end_idx = start_idx, end_idx
        self.token_type = token_type

    def extract_bond_char(self, smiles):
        """Returns the bond character of this token within ``smiles``,
        or ``None`` if the token has no bond."""
        if self.bond_idx is None:
            return None
        return smiles[self.bond_idx]

    def extract_symbol(self, smiles):
        """Returns the text of this token's symbol within ``smiles``."""
        return smiles[self.start_idx:self.end_idx]
def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]:
    """Splits a SMILES string into its tokens.

    Each token consists of an optional bond character followed by a symbol
    (atom, branch bracket or ring number); a dot is a token of its own.

    :param smiles: the input SMILES string.
    :return: the tokens of the input SMILES one-by-one with order preserved.
    :raises SMILESParserError: (on iteration) on a bond that is not
        followed by an atom or ring number, an unclosed ``[``, an invalid
        ``%`` ring number, or an unrecognized character.
    """

    i = 0
    while i < len(smiles):

        if smiles[i] == ".":
            yield SMILESToken(None, i, i + 1, SMILESTokenTypes.DOT)
            i += 1
            continue

        # an optional bond character precedes the symbol
        if smiles[i] in SMILES_BOND_ORDERS:
            bond_idx = i
            i += 1
        else:
            bond_idx = None

        if i == len(smiles):  # the bond was the last character
            raise SMILESParserError(smiles, "hanging bond", i - 1)

        elif smiles[i].isalpha():  # organic subset elements
            if smiles[i: i + 2] in ("Br", "Cl"):  # two-letter elements
                token = SMILESToken(bond_idx, i, i + 2, SMILESTokenTypes.ATOM)
            else:  # one-letter elements (e.g. C, N, ...)
                token = SMILESToken(bond_idx, i, i + 1, SMILESTokenTypes.ATOM)

        elif smiles[i] == "[":  # atoms encased in brackets (e.g. [NH])
            r_idx = smiles.find("]", i + 1)
            if r_idx == -1:
                raise SMILESParserError(smiles, "hanging bracket [", i)
            token = SMILESToken(bond_idx, i, r_idx + 1, SMILESTokenTypes.ATOM)

        elif smiles[i] in ("(", ")"):  # open and closed branch brackets
            if bond_idx is not None:
                # same condition and wording as the end-of-string case above
                raise SMILESParserError(smiles, "hanging bond", bond_idx)
            token = SMILESToken(None, i, i + 1, SMILESTokenTypes.BRANCH)

        elif smiles[i].isdigit():  # one-digit ring number
            token = SMILESToken(bond_idx, i, i + 1, SMILESTokenTypes.RING)

        elif smiles[i] == "%":  # two-digit ring number (e.g. %12)
            rnum = smiles[i + 1: i + 3]
            if not (rnum.isnumeric() and len(rnum) == 2):
                err_msg = "invalid ring number '%{}'".format(rnum)
                raise SMILESParserError(smiles, err_msg, i)
            token = SMILESToken(bond_idx, i, i + 3, SMILESTokenTypes.RING)

        else:
            err_msg = "unrecognized symbol '{}'".format(smiles[i])
            raise SMILESParserError(smiles, err_msg, i)

        yield token
        i = token.end_idx
# =============================================================================
# SMILES -> Atom, Graph, etc.
# =============================================================================
def smiles_to_atom(atom_symbol: str) -> Optional[Atom]:
    """Reads an atom from its SMILES representation.

    Unbracketed symbols are looked up in ``ORGANIC_SUBSET`` and
    ``AROMATIC_SUBSET``. Bracketed symbols are parsed with
    ``SMILES_BRACKETED_ATOM_PATTERN`` into isotope, element, chirality,
    hydrogen count and charge; the last matched group is ignored.

    :param atom_symbol: a SMILES atom symbol.
    :return: the atom that the input symbol represents, or ``None`` if the
        symbol is empty, is not recognized, or names an element that is
        not in ``ELEMENTS``.
    """
    if not atom_symbol:
        # indexing an empty string below would raise IndexError
        return None

    if not (atom_symbol[0] == "[" and atom_symbol[-1] == "]"):
        if atom_symbol in ORGANIC_SUBSET:  # e.g. C, N, O, ...
            return Atom(atom_symbol, False)
        if atom_symbol in AROMATIC_SUBSET:  # e.g. c, n, o, ...
            return Atom(atom_symbol.capitalize(), True)
        return None

    # e.g. [C], [C@@H], [O-], ...
    m = SMILES_BRACKETED_ATOM_PATTERN.match(atom_symbol)
    if m is None:
        return None
    isotope, element, chirality, h_count, charge, _ = m.groups()

    isotope = None if (isotope == "") else int(isotope)

    # aromaticity is read off the lowercase spelling before normalizing
    is_aromatic = element.islower() and (element in AROMATIC_SUBSET)
    element = element.capitalize()
    if element not in ELEMENTS:
        return None

    chirality = None if (chirality == "") else chirality

    if h_count == "":
        h_count = 0
    else:
        digits = h_count[1:]  # HXXX -> XXX
        h_count = 1 if (digits == "") else int(digits)  # bare H means one

    if charge == "":
        charge = 0
    else:
        sign = 1 if charge[0] == "+" else -1
        if charge[-1].isdigit():  # (+/-)XXX
            magnitude = int(charge[1:])
        else:  # +++... or ---....
            magnitude = len(charge)
        charge = sign * magnitude

    return Atom(
        element=element,
        is_aromatic=is_aromatic,
        isotope=isotope,
        chirality=chirality,
        h_count=h_count,
        charge=charge
    )
def smiles_to_bond(
        bond_char: Optional[str]
) -> Tuple[Union[int, float], Optional[str]]:
    """Translates a SMILES bond symbol into a bond order and stereo tag.

    :param bond_char: a SMILES bond symbol, or ``None`` when no bond symbol
        was written.
    :return: a pair ``(order, stereo)``. The order is looked up in
        ``SMILES_BOND_ORDERS`` and falls back to 1 when there is no entry;
        the stereo tag is the symbol itself when it belongs to
        ``SMILES_STEREO_BONDS``, and ``None`` otherwise.
    """
    if bond_char in SMILES_STEREO_BONDS:
        stereo_tag = bond_char
    else:
        stereo_tag = None
    return SMILES_BOND_ORDERS.get(bond_char, 1), stereo_tag
def smiles_to_mol(smiles: str) -> MolecularGraph:
    """Builds a molecular graph out of a SMILES string.

    The whole string is tokenized up front; the tokens are then consumed
    by repeated calls to ``_derive_mol_from_tokens`` until none remain.

    :param smiles: the input SMILES string.
    :return: a molecular graph that the input SMILES string represents.
    :raises SMILESParserError: if the input SMILES is invalid.
    """
    if smiles == "":
        raise SMILESParserError(smiles, "empty SMILES", 0)

    graph = MolecularGraph()
    pending = deque(tokenize_smiles(smiles))
    while len(pending) > 0:
        # each call stops at a dot token or when the tokens run out
        _derive_mol_from_tokens(graph, smiles, pending)
    return graph
def _derive_mol_from_tokens(mol, smiles, tokens):
    """Consumes tokens from the left of ``tokens`` and adds them to ``mol``.

    Tokens are popped until a dot token is read or the deque is empty, so
    one call handles one dot-separated fragment. The deque is modified in
    place; whatever remains belongs to later fragments.

    :param mol: the molecular graph that atoms and bonds are added to.
    :param smiles: the SMILES string that the tokens index into.
    :param tokens: a deque of tokens, consumed from the left.
    :raises SMILESParserError: on an invalid atom symbol, a chain or branch
        that starts with a non-atom, an unmatched bracket, an unclosed
        ring number, or when ``mol`` is still empty afterwards.
    """
    tok = None
    prev_stack = deque()  # keep track of previous atom on the current chain
    branch_stack = deque()  # keep track of open branches
    ring_log = dict()  # keep track of hanging ring numbers
    # True until the first atom of a fragment or of a freshly opened branch
    # has been read
    chain_start = True

    # tok is None here, so the first atom sees prev_atom == None
    prev_stack.append(tok)

    while tokens:
        tok = tokens.popleft()
        bond_char = tok.extract_bond_char(smiles)
        symbol, symbol_type = tok.extract_symbol(smiles), tok.token_type
        prev_atom = prev_stack[-1]

        if symbol_type == SMILESTokenTypes.DOT:
            # end of this fragment; the dot itself has been consumed
            break

        elif symbol_type == SMILESTokenTypes.ATOM:
            curr = smiles_to_atom(symbol)
            if curr is None:
                err_msg = "invalid atom symbol '{}'".format(symbol)
                raise SMILESParserError(smiles, err_msg, tok.start_idx)

            curr = _attach_atom(mol, bond_char, curr, prev_atom)
            # the new atom replaces the top of the stack, becoming the
            # previous atom for whatever follows on this chain
            prev_stack.pop()
            prev_stack.append(curr)
            chain_start = False

        elif chain_start:
            # checked before the branch/ring cases on purpose: neither may
            # appear before the first atom of a chain
            err_msg = "SMILES chain begins with non-atom"
            raise SMILESParserError(smiles, err_msg, tok.start_idx)

        elif symbol_type == SMILESTokenTypes.BRANCH:
            if symbol == "(":
                branch_stack.append(tok)
                # duplicate the top so the branch hangs off prev_atom and
                # the matching ")" can restore it with a single pop
                prev_stack.append(prev_atom)
                chain_start = True
            else:
                if not branch_stack:
                    err_msg = "hanging ')' bracket"
                    raise SMILESParserError(smiles, err_msg, tok.start_idx)
                branch_stack.pop()
                prev_stack.pop()

        elif symbol_type == SMILESTokenTypes.RING:
            if symbol not in ring_log:
                # first occurrence: remember where the ring opened
                lpos = mol.add_placeholder_bond(src=prev_atom.index)
                ring_log[symbol] = (tok, prev_atom, lpos)
            else:
                # second occurrence: close the ring and free the number
                ltoken, latom, lpos = ring_log.pop(symbol)
                _make_ring_bonds(
                    mol=mol, smiles=smiles,
                    ltoken=ltoken, latom=latom, lpos=lpos,
                    rtoken=tok, ratom=prev_atom
                )

        else:
            # should not happen
            raise Exception("invalid symbol type")

    # NOTE(review): len(mol) presumably counts atoms over the whole graph,
    # not only this fragment; if so, this check can only fire while no
    # earlier fragment has added an atom -- confirm.
    if len(mol) == 0:
        # NOTE(review): this evaluates to -1 when the offending token
        # starts at index 0 -- confirm that is the intended error index.
        err_idx = (len(smiles) if (tok is None) else tok.start_idx) - 1
        raise SMILESParserError(smiles, "empty SMILES fragment", err_idx)

    if branch_stack:
        # report the most recently opened bracket that was never closed
        err_idx = branch_stack[-1].start_idx
        raise SMILESParserError(smiles, "hanging '(' bracket", err_idx)

    if ring_log:
        # report the last ring number still present in ring_log
        rnum, (tok, _, _) = list(ring_log.items())[-1]
        err_msg = "hanging ring number '{}'".format(rnum)
        raise SMILESParserError(smiles, err_msg, tok.start_idx)
def _attach_atom(mol, bond_char, atom, prev_atom):
    """Adds ``atom`` to ``mol`` and bonds it to ``prev_atom``, if any.

    :param mol: the molecular graph being built.
    :param bond_char: the bond symbol written before the atom, or ``None``.
    :param atom: the atom to add.
    :param prev_atom: the atom to bond to, or ``None`` if ``atom`` should be
        added as a root with no bond.
    :return: the atom that was added.
    """
    if prev_atom is None:
        mol.add_atom(atom, mark_root=True)
        return atom

    mol.add_atom(atom, mark_root=False)

    bond_order, bond_stereo = smiles_to_bond(bond_char)
    no_explicit_bond = (bond_char is None)
    if no_explicit_bond and prev_atom.is_aromatic and atom.is_aromatic:
        bond_order = 1.5  # handle implicit aromatic bonds, e.g. cc

    mol.add_bond(
        src=prev_atom.index, dst=atom.index,
        order=bond_order, stereo=bond_stereo
    )
    return atom
def _make_ring_bonds(mol, smiles, ltoken, latom, lpos, rtoken, ratom):
    """Closes a ring between ``latom`` and ``ratom`` in ``mol``.

    :param mol: the molecular graph being built.
    :param smiles: the SMILES string being parsed (used for errors and to
        read the bond characters of both tokens).
    :param ltoken: the token where the ring number first appeared.
    :param latom: the atom that ``ltoken`` was attached to.
    :param lpos: the placeholder bond position recorded for ``latom``.
    :param rtoken: the token where the ring number appeared again.
    :param ratom: the atom that ``rtoken`` was attached to.
    :raises SMILESParserError: if the two atoms are already bonded, or if
        the bond symbols at the two ends do not agree.
    """
    if mol.has_bond(latom.index, ratom.index):
        raise SMILESParserError(
            smiles,
            "ring bond specified between already-bonded atoms",
            ltoken.start_idx
        )

    lbond_char = ltoken.extract_bond_char(smiles)
    rbond_char = rtoken.extract_bond_char(smiles)

    # Order the pair so that a missing bond symbol, if there is only one,
    # ends up second.
    if lbond_char is None:
        first, second = rbond_char, lbond_char
    else:
        first, second = lbond_char, rbond_char

    # The ends agree if they are equal, if at most one end is written, or
    # if both are stereo bonds.
    ends_agree = (
        (first == second)
        or (second is None)
        or (first in SMILES_STEREO_BONDS and second in SMILES_STEREO_BONDS)
    )
    if not ends_agree:
        raise SMILESParserError(
            smiles, "mismatched ring bonds", ltoken.start_idx
        )

    lorder, lstereo = smiles_to_bond(lbond_char)
    rorder, rstereo = smiles_to_bond(rbond_char)

    neither_written = (lbond_char is None) and (rbond_char is None)
    if neither_written and latom.is_aromatic and ratom.is_aromatic:
        # handle implicit aromatic bonds, e.g. c1ccccc1
        lorder = rorder = 1.5

    mol.add_ring_bond(
        a=latom.index, a_stereo=lstereo, a_pos=lpos,
        b=ratom.index, b_stereo=rstereo,
        order=max(lorder, rorder)
    )
# =============================================================================
# SMILES <- Atom, Graph, etc.
# =============================================================================
def atom_to_smiles(atom: Atom, brackets: bool = True) -> str:
    """Converts an atom into its SMILES representation.

    An atom with no isotope, no chirality, an unspecified (``None``)
    hydrogen count and zero charge is written as its bare element symbol.
    Any other atom is written in full, with the fields in the order
    isotope, element, chirality, hydrogen count, charge.

    :param atom: the input atom; it must not be aromatic.
    :param brackets: True, if brackets should be added around the returned
        symbol (e.g. in the case of [C] or [C@@H]). Defaults to True.
    :return: a SMILES symbol representing the input atom.
    """
    assert not atom.is_aromatic

    specs = (atom.isotope, atom.chirality, atom.h_count, atom.charge)
    if specs == (None, None, None, 0):
        return atom.element

    builder = []

    if brackets:
        builder.append("[")

    if atom.isotope is not None:
        builder.append(str(atom.isotope))

    builder.append(atom.element)

    if atom.chirality is not None:
        builder.append(atom.chirality)

    # An unspecified hydrogen count (None) is left out rather than being
    # formatted into the symbol, which would produce the text "HNone".
    if atom.h_count is not None and atom.h_count != 0:
        builder.append("H")
        builder.append(str(atom.h_count))
    elif specs == (None, None, 0, 0) and (atom.element in ORGANIC_SUBSET):
        # Without "H0" this symbol would contain only the element, which is
        # also how the unspecified-hydrogen case above is written.
        builder.append("H0")

    if atom.charge != 0:
        builder.append("{:+}".format(atom.charge))

    if brackets:
        builder.append("]")

    return "".join(builder)
def bond_to_smiles(bond: DirectedBond) -> str:
    """Writes a bond as its SMILES symbol.

    :param bond: the input bond.
    :return: ``"="`` for order 2, ``"#"`` for order 3, and for order 1
        either the bond's stereo symbol (when it is one of
        ``SMILES_STEREO_BONDS``) or the empty string.
    :raises ValueError: if the bond order is not 1, 2 or 3.
    """
    order = bond.order
    if order == 2:
        return "="
    if order == 3:
        return "#"
    if order != 1:  # this should never happen
        raise ValueError()

    if bond.stereo in SMILES_STEREO_BONDS:
        return bond.stereo
    return ""
def mol_to_smiles(mol: MolecularGraph) -> str:
    """Writes a molecular graph as a SMILES string, following the traversal
    order that the graph itself dictates.

    Each root of the graph yields one fragment; fragments are joined with
    dots. Ring numbers are shared across all fragments.

    :param mol: the input molecule; it must be kekulized.
    :return: a SMILES string representing the input molecule.
    """
    assert mol.is_kekulized()

    ring_numbers = dict()

    def render_fragment(root_idx):
        symbols = []
        _derive_smiles_from_fragment(symbols, mol, root_idx, ring_numbers)
        return "".join(symbols)

    return ".".join(render_fragment(r) for r in mol.get_roots())
def _derive_smiles_from_fragment(derived, mol, root, ring_log):
    """Appends the SMILES symbols of the fragment rooted at ``root``.

    Atoms are visited depth-first, following each atom's out-bonds in the
    order that ``mol.get_out_dirbonds`` gives them. A ring bond is written
    as its bond symbol plus a ring number. Any other bond is written as its
    bond symbol followed by everything reachable through it, wrapped in
    parentheses unless it is the last out-bond of its atom.

    The walk keeps an explicit stack instead of calling itself, so a long
    unbranched chain cannot run into Python's recursion limit.

    :param derived: the list that the SMILES symbols are appended to.
    :param mol: the molecular graph being written.
    :param root: the index of the atom to start from.
    :param ring_log: a dict mapping the (sorted) end atoms of a ring bond to
        its ring number; it is updated in place.
    """
    # One frame per atom whose out-bonds are still being written:
    # (iterator over (position, bond), position of the last out-bond,
    #  whether a ")" has to be written once the atom is finished).
    frames = []

    def open_atom(idx, close_branch):
        derived.append(atom_to_smiles(mol.get_atom(idx)))
        out_bonds = mol.get_out_dirbonds(idx)
        frames.append(
            (iter(enumerate(out_bonds)), len(out_bonds) - 1, close_branch)
        )

    open_atom(root, False)

    while frames:
        bonds_iter, last_pos, close_branch = frames[-1]

        step = next(bonds_iter, None)
        if step is None:
            # every out-bond of this atom has been written
            frames.pop()
            if close_branch:
                derived.append(")")
            continue

        pos, bond = step

        if bond.ring_bond:
            derived.append(bond_to_smiles(bond))

            # both ends of a ring bond map to the same key, hence the same
            # ring number
            ends = (min(bond.src, bond.dst), max(bond.src, bond.dst))
            rnum = ring_log.setdefault(ends, len(ring_log) + 1)
            # NOTE(review): numbers are never reused here, so a 100th ring
            # would be written as "%100" -- confirm whether that can occur.
            if rnum >= 10:
                derived.append("%")
            derived.append(str(rnum))

        else:
            is_branch = pos < last_pos
            if is_branch:
                derived.append("(")

            derived.append(bond_to_smiles(bond))
            # The atom at the far end is written next; its frame is
            # finished (and the ")" written) before this atom's next
            # out-bond is looked at.
            open_atom(bond.dst, is_branch)
+89
-98
Metadata-Version: 2.1
Name: selfies
Version: 1.0.4
Version: 2.0.0
Summary: SELFIES (SELF-referencIng Embedded Strings) is a general-purpose, sequence-based, robust representation of semantically constrained graphs.
Home-page: https://github.com/aspuru-guzik-group/selfies
Author: Mario Krenn
Author: Mario Krenn, Alston Lo, and many other contributors
Author-email: mario.krenn@utoronto.ca, alan@aspuru.com

@@ -16,20 +16,24 @@ License: UNKNOWN

[![GitHub issues](https://img.shields.io/github/issues/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/issues/)
[![Documentation Status](https://readthedocs.org/projects/selfies/badge/?version=latest)](http://selfies.readthedocs.io/?badge=latest)
[![Documentation Status](https://readthedocs.org/projects/selfiesv2/badge/?version=latest)](http://selfiesv2.readthedocs.io/?badge=latest)
[![GitHub contributors](https://img.shields.io/github/contributors/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/)
**Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**<br>
_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br>
[*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br>
[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br>
Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br>
**Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**\
_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\
[*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\
[Talk on YouTube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\
[Blog explaining SELFIES in Japanese language](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\
Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\
Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ)
A main objective is to use SELFIES as direct input into machine learning models,<br>
in particular in generative models, for the generation of molecular graphs<br>
---
A main objective is to use SELFIES as direct input into machine learning models,
in particular in generative models, for the generation of molecular graphs
which are syntactically and semantically valid.
<center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center>
<p align="center">
<img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px">
</p>
## Installation

@@ -52,3 +56,3 @@ Use pip to install ``selfies``.

[CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md)
to review the changes between versions of `selfies`:
to review the changes between versions of `selfies`, before upgrading:

@@ -59,28 +63,23 @@ ```bash

## Documentation
The documentation can be found on
[ReadTheDocs](https://selfies.readthedocs.io/en/latest/).
Alternatively, it can be built from the ``docs/`` directory.
## Usage
### Standard Functions
### Overview
The ``selfies`` library has eight standard functions:
Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/),
which contains a thorough tutorial for getting started with ``selfies``
and detailed descriptions of the functions
that ``selfies`` provides. We summarize some key functions below.
| Function | Description |
| -------- | ----------- |
| ``selfies.encoder`` | Translates a SMILES into an equivalent SELFIES. |
| ``selfies.decoder`` | Translates a SELFIES into an equivalent SMILES. |
| ``selfies.len_selfies`` | Returns the (symbol) length of a SELFIES. |
| ``selfies.split_selfies`` | Splits a SELFIES into its symbols. |
| ``selfies.get_alphabet_from_selfies`` | Builds an alphabet of SELFIES symbols from an iterable of SELFIES. |
| ``selfies.get_semantic_robust_alphabet`` | Returns a subset of all SELFIES symbols that are semantically constrained. |
| ``selfies.selfies_to_encoding`` | Converts a SELFIES into a label and/or one-hot encoding. |
| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES. |
| ``selfies.encoder`` | Translates a SMILES string into its corresponding SELFIES string. |
| ``selfies.decoder`` | Translates a SELFIES string into its corresponding SMILES string. |
| ``selfies.set_semantic_constraints`` | Configures the semantic constraints that ``selfies`` operates on. |
| ``selfies.len_selfies`` | Returns the number of symbols in a SELFIES string. |
| ``selfies.split_selfies`` | Tokenizes a SELFIES string into its individual symbols. |
| ``selfies.get_alphabet_from_selfies`` | Constructs an alphabet from an iterable of SELFIES strings. |
| ``selfies.selfies_to_encoding`` | Converts a SELFIES string into its label and/or one-hot encoding. |
| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES string. |
Please read the documentation for more detailed descriptions of these
functions, and to view the advanced functions, which allow users to
customize the SELFIES language.

@@ -96,19 +95,41 @@ ### Examples

# SMILES --> SELFIES translation
encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]'
# SMILES -> SELFIES -> SMILES translation
try:
benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1]
benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1
except sf.EncoderError:
pass # sf.encoder error!
except sf.DecoderError:
pass # sf.decoder error!
# SELFIES --> SMILES translation
decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1'
len_benzene = sf.len_selfies(benzene_sf) # 8
len_benzene = sf.len_selfies(encoded_selfies) # 8
symbols_benzene = list(sf.split_selfies(benzene_sf))
# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]']
```
symbols_benzene = list(sf.split_selfies(encoded_selfies))
# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]']
#### Customizing SELFIES:
In this example, we relax the semantic constraints of ``selfies`` to allow
for hypervalences (caution: hypervalence rules are much less understood
than octet rules. Some molecules containing hypervalences are important,
but generally, it is not known which molecules are stable and reasonable).
```python
import selfies as sf
hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid
standard_derived_smi = sf.decoder(hypervalent_sf)
# OI (the default constraints for I allows for only 1 bond)
sf.set_semantic_constraints("hypervalent")
relaxed_derived_smi = sf.decoder(hypervalent_sf)
# O=I(O)(O)(O)(O)O (the hypervalent constraints for I allows for 7 bonds)
```
#### Integer and one-hot encoding SELFIES:
In this example we first build an alphabet
from a dataset of SELFIES, and then convert a SELFIES into a
padded, label-encoded representation. Note that we use the
``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
In this example, we first build an alphabet from a dataset of SELFIES strings,
and then convert a SELFIES string into its padded encoding. Note that we use the
``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
symbol to pad our SELFIES, which is a special SELFIES symbol that is always

@@ -121,7 +142,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful

dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]']
dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"]
alphabet = sf.get_alphabet_from_selfies(dataset)
alphabet.add('[nop]') # '[nop]' is a special padding symbol
alphabet = list(sorted(alphabet))
print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']
alphabet.add("[nop]") # [nop] is a special padding symbol
alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']

@@ -131,19 +151,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5

# SELFIES to label encode
dimethyl_ether = dataset[0] # '[C][O][C]'
dimethyl_ether = dataset[0] # [C][O][C]
# [1, 3, 1, 4, 4]
print(sf.selfies_to_encoding(dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type='label'))
# [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
print(sf.selfies_to_encoding(dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type='one_hot'))
label, one_hot = sf.selfies_to_encoding(
selfies=dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type="both"
)
# label = [1, 3, 1, 4, 4]
# one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
```
### More Examples
### More Usages and Examples

@@ -158,19 +174,6 @@ * More examples can be found in the ``examples/`` directory, including a

* Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator).
* As an improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling.
## Handling invalid inputs
If an invalid input is presented to the encoder or decoder, the return value is `None`.
The error can be analysed by using the `encoder(...,print_error=True)` option.
```python
import selfies as sf
invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1"
selfies_string=sf.encoder(invalid_smiles)
if selfies_string==None:
selfies_string=sf.encoder(invalid_smiles,print_error=True)
# 'Encoding error 'C[C@H](O)[C@@(*)C1=CC=CC=C1': wildcard atom '*' not supported.'
```
## Tests
SELFIES uses `pytest` with `tox` as its testing framework.
`selfies` uses `pytest` with `tox` as its testing framework.
All tests can be found in the `tests/` directory. To run the test suite for

@@ -180,29 +183,17 @@ SELFIES, install ``tox`` and run:

```bash
tox
tox -- --trials=10000 --dataset_samples=10000
```
By default, SELFIES is tested against a random subset
(of size ``dataset_samples=100000``) on various datasets:
By default, `selfies` is tested against a random subset
(of size ``dataset_samples=10000``) on various datasets:
* 130K molecules from [QM9](https://www.nature.com/articles/sdata201422)
* 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database)
* 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
* 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet
* 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet
* 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database).
* 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
* 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets
* 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html).
Due to its large size, this dataset is not included in the repository. To run tests
on it, please download the dataset in the ``tests/test_sets`` directory
and enable its pytest at ``tests/test_on_emolecules.py``.
on it, please download the dataset into the ``tests/test_sets`` directory
and run the ``tests/run_on_large_dataset.py`` script.
Other tests are random and repeated ``trials`` number of times.
These can be specified as arguments
```bash
tox -- --trials 100 --dataset_samples 100
```
where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that
if ``dataset_samples`` is negative or exceeds the length of the dataset,
the whole dataset is used.
## Version History

@@ -213,5 +204,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md).

We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin,
HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean),
Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports,
We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin,
HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan),
Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports,
and Robert Pollice for chemistry advice.

@@ -218,0 +209,0 @@

+87
-96

@@ -8,20 +8,24 @@ # SELFIES

[![GitHub issues](https://img.shields.io/github/issues/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/issues/)
[![Documentation Status](https://readthedocs.org/projects/selfies/badge/?version=latest)](http://selfies.readthedocs.io/?badge=latest)
[![Documentation Status](https://readthedocs.org/projects/selfiesv2/badge/?version=latest)](http://selfiesv2.readthedocs.io/?badge=latest)
[![GitHub contributors](https://img.shields.io/github/contributors/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/)
**Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**<br>
_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br>
[*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br>
[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br>
Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br>
**Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**\
_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\
[*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\
[Talk on YouTube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\
[Blog explaining SELFIES in Japanese language](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\
Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\
Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ)
A main objective is to use SELFIES as direct input into machine learning models,<br>
in particular in generative models, for the generation of molecular graphs<br>
---
A main objective is to use SELFIES as direct input into machine learning models,
in particular in generative models, for the generation of molecular graphs
which are syntactically and semantically valid.
<center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center>
<p align="center">
<img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px">
</p>
## Installation

@@ -44,3 +48,3 @@ Use pip to install ``selfies``.

[CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md)
to review the changes between versions of `selfies`:
to review the changes between versions of `selfies`, before upgrading:

@@ -51,28 +55,23 @@ ```bash

## Documentation
The documentation can be found on
[ReadTheDocs](https://selfies.readthedocs.io/en/latest/).
Alternatively, it can be built from the ``docs/`` directory.
## Usage
### Standard Functions
### Overview
The ``selfies`` library has eight standard functions:
Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/),
which contains a thorough tutorial for getting started with ``selfies``
and detailed descriptions of the functions
that ``selfies`` provides. We summarize some key functions below.
| Function | Description |
| -------- | ----------- |
| ``selfies.encoder`` | Translates a SMILES into an equivalent SELFIES. |
| ``selfies.decoder`` | Translates a SELFIES into an equivalent SMILES. |
| ``selfies.len_selfies`` | Returns the (symbol) length of a SELFIES. |
| ``selfies.split_selfies`` | Splits a SELFIES into its symbols. |
| ``selfies.get_alphabet_from_selfies`` | Builds an alphabet of SELFIES symbols from an iterable of SELFIES. |
| ``selfies.get_semantic_robust_alphabet`` | Returns a subset of all SELFIES symbols that are semantically constrained. |
| ``selfies.selfies_to_encoding`` | Converts a SELFIES into a label and/or one-hot encoding. |
| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES. |
| ``selfies.encoder`` | Translates a SMILES string into its corresponding SELFIES string. |
| ``selfies.decoder`` | Translates a SELFIES string into its corresponding SMILES string. |
| ``selfies.set_semantic_constraints`` | Configures the semantic constraints that ``selfies`` operates on. |
| ``selfies.len_selfies`` | Returns the number of symbols in a SELFIES string. |
| ``selfies.split_selfies`` | Tokenizes a SELFIES string into its individual symbols. |
| ``selfies.get_alphabet_from_selfies`` | Constructs an alphabet from an iterable of SELFIES strings. |
| ``selfies.selfies_to_encoding`` | Converts a SELFIES string into its label and/or one-hot encoding. |
| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES string. |
Please read the documentation for more detailed descriptions of these
functions, and to view the advanced functions, which allow users to
customize the SELFIES language.

@@ -88,19 +87,41 @@ ### Examples

# SMILES --> SELFIES translation
encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]'
# SMILES -> SELFIES -> SMILES translation
try:
benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1]
benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1
except sf.EncoderError:
pass # sf.encoder error!
except sf.DecoderError:
pass # sf.decoder error!
# SELFIES --> SMILES translation
decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1'
len_benzene = sf.len_selfies(benzene_sf) # 8
len_benzene = sf.len_selfies(encoded_selfies) # 8
symbols_benzene = list(sf.split_selfies(benzene_sf))
# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]']
```
symbols_benzene = list(sf.split_selfies(encoded_selfies))
# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]']
#### Customizing SELFIES:
In this example, we relax the semantic constraints of ``selfies`` to allow
for hypervalences (caution: hypervalence rules are much less understood
than octet rules. Some molecules containing hypervalences are important,
but generally, it is not known which molecules are stable and reasonable).
```python
import selfies as sf
hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid
standard_derived_smi = sf.decoder(hypervalent_sf)
# OI (the default constraints for I allows for only 1 bond)
sf.set_semantic_constraints("hypervalent")
relaxed_derived_smi = sf.decoder(hypervalent_sf)
# O=I(O)(O)(O)(O)O (the hypervalent constraints for I allows for 7 bonds)
```
#### Integer and one-hot encoding SELFIES:
In this example we first build an alphabet
from a dataset of SELFIES, and then convert a SELFIES into a
padded, label-encoded representation. Note that we use the
``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
In this example, we first build an alphabet from a dataset of SELFIES strings,
and then convert a SELFIES string into its padded encoding. Note that we use the
``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
symbol to pad our SELFIES, which is a special SELFIES symbol that is always

@@ -113,7 +134,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful

dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]']
dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"]
alphabet = sf.get_alphabet_from_selfies(dataset)
alphabet.add('[nop]') # '[nop]' is a special padding symbol
alphabet = list(sorted(alphabet))
print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']
alphabet.add("[nop]") # [nop] is a special padding symbol
alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']

@@ -123,19 +143,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5

# SELFIES to label encode
dimethyl_ether = dataset[0] # '[C][O][C]'
dimethyl_ether = dataset[0] # [C][O][C]
# [1, 3, 1, 4, 4]
print(sf.selfies_to_encoding(dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type='label'))
# [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
print(sf.selfies_to_encoding(dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type='one_hot'))
label, one_hot = sf.selfies_to_encoding(
selfies=dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type="both"
)
# label = [1, 3, 1, 4, 4]
# one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
```
### More Examples
### More Usages and Examples

@@ -150,19 +166,6 @@ * More examples can be found in the ``examples/`` directory, including a

* Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator).
* An improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling.
## Handling invalid inputs
If an invalid input is presented to the encoder or decoder, the return value is `None`.
The error can be analysed by using the `encoder(...,print_error=True)` option.
```python
import selfies as sf
invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1"
selfies_string=sf.encoder(invalid_smiles)
if selfies_string==None:
selfies_string=sf.encoder(invalid_smiles,print_error=True)
# 'Encoding error 'C[C@H](O)[C@@(*)C1=CC=CC=C1': wildcard atom '*' not supported.'
```
## Tests
SELFIES uses `pytest` with `tox` as its testing framework.
`selfies` uses `pytest` with `tox` as its testing framework.
All tests can be found in the `tests/` directory. To run the test suite for

@@ -172,29 +175,17 @@ SELFIES, install ``tox`` and run:

```bash
tox
tox -- --trials=10000 --dataset_samples=10000
```
By default, SELFIES is tested against a random subset
(of size ``dataset_samples=100000``) on various datasets:
By default, `selfies` is tested against a random subset
(of size ``dataset_samples=10000``) on various datasets:
* 130K molecules from [QM9](https://www.nature.com/articles/sdata201422)
* 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database)
* 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
* 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet
* 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet
* 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database).
* 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
* 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets
* 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html).
Due to its large size, this dataset is not included on the repository. To run tests
on it, please download the dataset in the ``tests/test_sets`` directory
and enable its pytest at ``tests/test_on_emolecules.py``.
on it, please download the dataset into the ``tests/test_sets`` directory
and run the ``tests/run_on_large_dataset.py`` script.
Other tests are random and repeated ``trials`` number of times.
These can be specified as arguments
```bash
tox -- --trials 100 --dataset_samples 100
```
where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that
if ``dataset_samples`` is negative or exceeds the length of the dataset,
the whole dataset is used.
## Version History

@@ -205,5 +196,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md).

We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin,
HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean),
Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports,
We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin,
HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan),
Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports,
and Robert Pollice for chemistry advice.

@@ -210,0 +201,0 @@

Metadata-Version: 2.1
Name: selfies
Version: 1.0.4
Version: 2.0.0
Summary: SELFIES (SELF-referencIng Embedded Strings) is a general-purpose, sequence-based, robust representation of semantically constrained graphs.
Home-page: https://github.com/aspuru-guzik-group/selfies
Author: Mario Krenn
Author: Mario Krenn, Alston Lo, and many other contributors
Author-email: mario.krenn@utoronto.ca, alan@aspuru.com

@@ -16,20 +16,24 @@ License: UNKNOWN

[![GitHub issues](https://img.shields.io/github/issues/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/issues/)
[![Documentation Status](https://readthedocs.org/projects/selfies/badge/?version=latest)](http://selfies.readthedocs.io/?badge=latest)
[![Documentation Status](https://readthedocs.org/projects/selfiesv2/badge/?version=latest)](http://selfiesv2.readthedocs.io/?badge=latest)
[![GitHub contributors](https://img.shields.io/github/contributors/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/)
**Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**<br>
_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br>
[*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br>
[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br>
Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br>
**Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**\
_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\
[*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\
[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\
[Blog explaining SELFIES in Japanese language](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\
Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\
Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ)
A main objective is to use SELFIES as direct input into machine learning models,<br>
in particular in generative models, for the generation of molecular graphs<br>
---
A main objective is to use SELFIES as direct input into machine learning models,
in particular in generative models, for the generation of molecular graphs
which are syntactically and semantically valid.
<center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center>
<p align="center">
<img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px">
</p>
## Installation

@@ -52,3 +56,3 @@ Use pip to install ``selfies``.

[CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md)
to review the changes between versions of `selfies`:
to review the changes between versions of `selfies`, before upgrading:

@@ -59,28 +63,23 @@ ```bash

## Documentation
The documentation can be found on
[ReadTheDocs](https://selfies.readthedocs.io/en/latest/).
Alternatively, it can be built from the ``docs/`` directory.
## Usage
### Standard Functions
### Overview
The ``selfies`` library has eight standard functions:
Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/),
which contains a thorough tutorial for getting started with ``selfies``
and detailed descriptions of the functions
that ``selfies`` provides. We summarize some key functions below.
| Function | Description |
| -------- | ----------- |
| ``selfies.encoder`` | Translates a SMILES into an equivalent SELFIES. |
| ``selfies.decoder`` | Translates a SELFIES into an equivalent SMILES. |
| ``selfies.len_selfies`` | Returns the (symbol) length of a SELFIES. |
| ``selfies.split_selfies`` | Splits a SELFIES into its symbols. |
| ``selfies.get_alphabet_from_selfies`` | Builds an alphabet of SELFIES symbols from an iterable of SELFIES. |
| ``selfies.get_semantic_robust_alphabet`` | Returns a subset of all SELFIES symbols that are semantically constrained. |
| ``selfies.selfies_to_encoding`` | Converts a SELFIES into a label and/or one-hot encoding. |
| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES. |
| ``selfies.encoder`` | Translates a SMILES string into its corresponding SELFIES string. |
| ``selfies.decoder`` | Translates a SELFIES string into its corresponding SMILES string. |
| ``selfies.set_semantic_constraints`` | Configures the semantic constraints that ``selfies`` operates on. |
| ``selfies.len_selfies`` | Returns the number of symbols in a SELFIES string. |
| ``selfies.split_selfies`` | Tokenizes a SELFIES string into its individual symbols. |
| ``selfies.get_alphabet_from_selfies`` | Constructs an alphabet from an iterable of SELFIES strings. |
| ``selfies.selfies_to_encoding`` | Converts a SELFIES string into its label and/or one-hot encoding. |
| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES string. |
Please read the documentation for more detailed descriptions of these
functions, and to view the advanced functions, which allow users to
customize the SELFIES language.

@@ -96,19 +95,41 @@ ### Examples

# SMILES --> SELFIES translation
encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]'
# SMILES -> SELFIES -> SMILES translation
try:
benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1]
benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1
except sf.EncoderError:
pass # sf.encoder error!
except sf.DecoderError:
pass # sf.decoder error!
# SELFIES --> SMILES translation
decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1'
len_benzene = sf.len_selfies(benzene_sf) # 8
len_benzene = sf.len_selfies(encoded_selfies) # 8
symbols_benzene = list(sf.split_selfies(benzene_sf))
# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]']
```
symbols_benzene = list(sf.split_selfies(encoded_selfies))
# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]']
#### Customizing SELFIES:
In this example, we relax the semantic constraints of ``selfies`` to allow
for hypervalences (caution: hypervalence rules are much less understood
than octet rules. Some molecules containing hypervalences are important,
but generally, it is not known which molecules are stable and reasonable).
```python
import selfies as sf
hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid
standard_derived_smi = sf.decoder(hypervalent_sf)
# OI (the default constraints for I allows for only 1 bond)
sf.set_semantic_constraints("hypervalent")
relaxed_derived_smi = sf.decoder(hypervalent_sf)
# O=I(O)(O)(O)(O)O (the hypervalent constraints for I allows for 7 bonds)
```
#### Integer and one-hot encoding SELFIES:
In this example we first build an alphabet
from a dataset of SELFIES, and then convert a SELFIES into a
padded, label-encoded representation. Note that we use the
``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
In this example, we first build an alphabet from a dataset of SELFIES strings,
and then convert a SELFIES string into its padded encoding. Note that we use the
``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
symbol to pad our SELFIES, which is a special SELFIES symbol that is always

@@ -121,7 +142,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful

dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]']
dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"]
alphabet = sf.get_alphabet_from_selfies(dataset)
alphabet.add('[nop]') # '[nop]' is a special padding symbol
alphabet = list(sorted(alphabet))
print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']
alphabet.add("[nop]") # [nop] is a special padding symbol
alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']

@@ -131,19 +151,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5

# SELFIES to label encode
dimethyl_ether = dataset[0] # '[C][O][C]'
dimethyl_ether = dataset[0] # [C][O][C]
# [1, 3, 1, 4, 4]
print(sf.selfies_to_encoding(dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type='label'))
# [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
print(sf.selfies_to_encoding(dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type='one_hot'))
label, one_hot = sf.selfies_to_encoding(
selfies=dimethyl_ether,
vocab_stoi=symbol_to_idx,
pad_to_len=pad_to_len,
enc_type="both"
)
# label = [1, 3, 1, 4, 4]
# one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
```
### More Examples
### More Usages and Examples

@@ -158,19 +174,6 @@ * More examples can be found in the ``examples/`` directory, including a

* Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator).
* An improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling.
## Handling invalid inputs
If an invalid input is presented to the encoder or decoder, the return value is `None`.
The error can be analysed by using the `encoder(...,print_error=True)` option.
```python
import selfies as sf
invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1"
selfies_string=sf.encoder(invalid_smiles)
if selfies_string==None:
selfies_string=sf.encoder(invalid_smiles,print_error=True)
# 'Encoding error 'C[C@H](O)[C@@(*)C1=CC=CC=C1': wildcard atom '*' not supported.'
```
## Tests
SELFIES uses `pytest` with `tox` as its testing framework.
`selfies` uses `pytest` with `tox` as its testing framework.
All tests can be found in the `tests/` directory. To run the test suite for

@@ -180,29 +183,17 @@ SELFIES, install ``tox`` and run:

```bash
tox
tox -- --trials=10000 --dataset_samples=10000
```
By default, SELFIES is tested against a random subset
(of size ``dataset_samples=100000``) on various datasets:
By default, `selfies` is tested against a random subset
(of size ``dataset_samples=10000``) on various datasets:
* 130K molecules from [QM9](https://www.nature.com/articles/sdata201422)
* 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database)
* 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
* 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet
* 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet
* 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database).
* 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
* 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets
* 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html).
Due to its large size, this dataset is not included on the repository. To run tests
on it, please download the dataset in the ``tests/test_sets`` directory
and enable its pytest at ``tests/test_on_emolecules.py``.
on it, please download the dataset into the ``tests/test_sets`` directory
and run the ``tests/run_on_large_dataset.py`` script.
Other tests are random and repeated ``trials`` number of times.
These can be specified as arguments
```bash
tox -- --trials 100 --dataset_samples 100
```
where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that
if ``dataset_samples`` is negative or exceeds the length of the dataset,
the whole dataset is used.
## Version History

@@ -213,5 +204,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md).

We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin,
HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean),
Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports,
We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin,
HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan),
Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports,
and Robert Pollice for chemistry advice.

@@ -218,0 +209,0 @@

README.md
setup.py
selfies/__init__.py
selfies/bond_constraints.py
selfies/compatibility.py
selfies/constants.py
selfies/decoder.py
selfies/encoder.py
selfies/exceptions.py
selfies/grammar_rules.py
selfies/kekulize.py
selfies/utils.py
selfies/mol_graph.py
selfies.egg-info/PKG-INFO
selfies.egg-info/SOURCES.txt
selfies.egg-info/dependency_links.txt
selfies.egg-info/top_level.txt
selfies.egg-info/top_level.txt
selfies/utils/__init__.py
selfies/utils/encoding_utils.py
selfies/utils/linked_list.py
selfies/utils/matching_utils.py
selfies/utils/selfies_utils.py
selfies/utils/smiles_utils.py

@@ -18,7 +18,7 @@ #!/usr/bin/env python

Typical usage example:
import selfies
import selfies as sf
benzene = "C1=CC=CC=C1"
selfies_benzene = selfies.encoder(benzene)
smiles_benzene = selfies.decoder(selfies_benzene)
benzene_selfies = sf.encoder(benzene)
benzene_smiles = sf.decoder(benzene_selfies)

@@ -29,3 +29,3 @@ For comments, bug reports or feature ideas, please send an email to

__version__ = "1.0.3"
__version__ = "2.0.0"

@@ -35,6 +35,4 @@ __all__ = [

"decoder",
"get_preset_constraints",
"get_semantic_robust_alphabet",
"get_default_constraints",
"get_octet_rule_constraints",
"get_hypervalent_constraints",
"get_semantic_constraints",

@@ -49,22 +47,25 @@ "set_semantic_constraints",

"batch_flat_hot_to_selfies",
"EncoderError",
"DecoderError"
]
from .bond_constraints import (
get_preset_constraints,
get_semantic_constraints,
get_semantic_robust_alphabet,
set_semantic_constraints
)
from .decoder import decoder
from .encoder import encoder
from .grammar_rules import (
get_semantic_robust_alphabet,
get_default_constraints,
get_octet_rule_constraints,
get_hypervalent_constraints,
get_semantic_constraints,
set_semantic_constraints,
from .exceptions import DecoderError, EncoderError
from .utils.encoding_utils import (
batch_flat_hot_to_selfies,
batch_selfies_to_flat_hot,
encoding_to_selfies,
selfies_to_encoding
)
from .utils import (
from .utils.selfies_utils import (
get_alphabet_from_selfies,
len_selfies,
split_selfies,
selfies_to_encoding,
batch_selfies_to_flat_hot,
encoding_to_selfies,
batch_flat_hot_to_selfies,
split_selfies
)

@@ -1,372 +0,221 @@

from collections import OrderedDict
from typing import Dict, Iterable, List, Optional, Tuple, Union
import warnings
from selfies.grammar_rules import (get_bond_from_num,
get_hypervalent_constraints,
get_n_from_symbols, get_next_branch_state,
get_next_state, get_num_from_bond,
get_octet_rule_constraints,
get_semantic_constraints,
set_semantic_constraints)
from selfies.compatibility import modernize_symbol
from selfies.exceptions import DecoderError
from selfies.grammar_rules import (
get_index_from_selfies,
next_atom_state,
next_branch_state,
next_ring_state,
process_atom_symbol,
process_branch_symbol,
process_ring_symbol
)
from selfies.mol_graph import MolecularGraph
from selfies.utils.selfies_utils import split_selfies
from selfies.utils.smiles_utils import mol_to_smiles
def decoder(selfies: str,
print_error: bool = False,
constraints: Optional[str] = None) -> Optional[str]:
"""Translates a SELFIES into a SMILES.
def decoder(selfies: str, compatible: bool = False) -> str:
"""Translates a SELFIES string into its corresponding SMILES string.
The SELFIES to SMILES translation operates based on the :mod:`selfies`
grammar rules, which can be configured using
:func:`selfies.set_semantic_constraints`. Given the appropriate settings,
the decoded SMILES will always be syntactically and semantically correct.
That is, the output SMILES will satisfy the specified bond constraints.
Additionally, :func:`selfies.decoder` will attempt to preserve the
atom and branch order of the input SELFIES.
This translation is deterministic but depends on the current semantic
constraints. The output SMILES string is guaranteed to be syntactically
correct and guaranteed to represent a molecule that obeys the
semantic constraints.
:param selfies: the SELFIES to be translated.
:param print_error: if True, error messages will be printed to console.
Defaults to False.
:param constraints: if ``'octet_rule'`` or ``'hypervalent'``,
the corresponding preset bond constraints will be used instead.
If ``None``, :func:`selfies.decoder` will use the
currently configured bond constraints. Defaults to ``None``.
:return: the SMILES translation of ``selfies``. If an error occurs,
and ``selfies`` cannot be translated, ``None`` is returned instead.
:param selfies: the SELFIES string to be translated.
:param compatible: if ``True``, this function will accept SELFIES strings
containing deprecated symbols from previous releases. However, this
function may behave differently than in previous major releases,
and should not be treated as backward compatible.
Defaults to ``False``.
:return: a SMILES string derived from the input SELFIES string.
:raises DecoderError: if the input SELFIES string is malformed.
:Example:
>>> import selfies
>>> selfies.decoder('[C][=C][F]')
>>> import selfies as sf
>>> sf.decoder('[C][=C][F]')
'C=CF'
.. seealso:: The
`"octet_rule" <https://en.wikipedia.org/wiki/Octet_rule>`_
and
`"hypervalent" <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_
preset bond constraints
can be viewed with :func:`selfies.get_octet_rule_constraints` and
:func:`selfies.get_hypervalent_constraints`, respectively. These
presets are variants of the "default" bond constraints, which can
be viewed with :func:`selfies.get_default_constraints`. Their
differences can be summarized as follows:
* def. : ``Cl``, ``Br``, ``I``: 1, ``N``: 3, ``P``: 5, ``P+1``: 6, ``P-1``: 4, ``S``: 6, ``S+1``: 7, ``S-1``: 5
* oct. : ``Cl``, ``Br``, ``I``: 1, ``N``: 3, ``P``: 3, ``P+1``: 4, ``P-1``: 2, ``S``: 2, ``S+1``: 3, ``S-1``: 1
* hyp. : ``Cl``, ``Br``, ``I``: 7, ``N``: 5, ``P``: 5, ``P+1``: 6, ``P-1``: 4, ``S``: 6, ``S+1``: 7, ``S-1``: 5
"""
old_constraints = get_semantic_constraints()
if constraints is None:
pass
elif constraints == 'octet_rule':
set_semantic_constraints(get_octet_rule_constraints())
elif constraints == 'hypervalent':
set_semantic_constraints(get_hypervalent_constraints())
else:
raise ValueError("unrecognized constraint type")
if compatible:
msg = "\nselfies.decoder() may behave differently than in previous " \
"major releases. We recommend using SELFIES that are up to date."
warnings.warn(msg, stacklevel=2)
try:
all_smiles = [] # process dot-separated fragments separately
mol = MolecularGraph()
for s in selfies.split("."):
smiles = _translate_selfies(s)
rings = []
for s in selfies.split("."):
_derive_mol_from_symbols(
symbol_iter=_tokenize_selfies(s, compatible),
mol=mol,
selfies=selfies,
max_derive=float("inf"),
init_state=0,
root_atom=None,
rings=rings
)
_form_rings_bilocally(mol, rings)
return mol_to_smiles(mol)
if smiles != "": # prevent malformed dots (e.g. [C]..[C], .[C][C])
all_smiles.append(smiles)
if constraints is not None: # restore old constraints
set_semantic_constraints(old_constraints)
def _tokenize_selfies(selfies, compatible):
if isinstance(selfies, str):
symbol_iter = split_selfies(selfies)
elif isinstance(selfies, list):
symbol_iter = selfies
else:
raise ValueError() # should not happen
return '.'.join(all_smiles)
try:
for symbol in symbol_iter:
if symbol == "[nop]":
continue
if compatible:
symbol = modernize_symbol(symbol)
yield symbol
except ValueError as err:
if constraints is not None: # restore old constraints
set_semantic_constraints(old_constraints)
raise DecoderError(str(err)) from None
if print_error:
print("Decoding error '{}': {}.".format(selfies, err))
return None
def _parse_selfies(selfies: str) -> Iterable[str]:
    """Parses a SELFIES into its symbols.

    A generator that scans ``selfies`` from left to right and yields each
    bracketed symbol (e.g. ``[C]``) in order, skipping ``[nop]``. Once no
    symbols are left, the empty string is yielded indefinitely, so callers
    can keep calling ``next`` without handling ``StopIteration``.

    As a precondition, the input SELFIES contains no dots, so all symbols
    are enclosed by square brackets, e.g. [X]. Any text that precedes the
    first ``[`` is ignored.

    :param selfies: the SELFIES string to be parsed.
    :return: an iterable of the symbols of the SELFIES.
    :raises ValueError: (lazily, while iterating) if a symbol is never
        closed by ``]``, or if a closed symbol is followed by a character
        other than ``[``.
    """

    left_idx = selfies.find('[')

    while 0 <= left_idx < len(selfies):
        right_idx = selfies.find(']', left_idx + 1)

        # After the first symbol, left_idx is simply "one past the previous
        # ']'", so it must land exactly on the next opening bracket.
        if (selfies[left_idx] != '[') or (right_idx == -1):
            raise ValueError("malformed SELFIES, "
                             "misplaced or missing brackets")

        next_symbol = selfies[left_idx: right_idx + 1]
        left_idx = right_idx + 1

        if next_symbol != '[nop]':  # skip [nop]
            yield next_symbol

    while True:  # no more symbols left
        yield ''
def _parse_selfies_symbols(selfies_symbols: List[str]) -> Iterable[str]:
    """List-based counterpart of ``_parse_selfies``.

    Yields the given symbols in order, dropping every ``[nop]``, and then
    yields the empty string indefinitely once the list is exhausted.

    :param selfies_symbols: a SELFIES represented as a list of SELFIES symbols.
    :return: an iterable of the symbols of the SELFIES.
    """

    yield from (token for token in selfies_symbols if token != '[nop]')

    while True:  # input exhausted: pad with empty strings
        yield ''
def _translate_selfies(selfies: str) -> str:
    """Translates a dot-free SELFIES into a SMILES.

    This is a helper for ``selfies.decoder``; the input SELFIES is assumed
    to contain no dots.

    :param selfies: the SELFIES to be translated.
    :return: the SMILES translation of the SELFIES.
    """

    # atoms[i] is a three-element list describing the i-th derived atom:
    #   [0] the atom together with its connecting bond (e.g. =C, #N, N, C)
    #   [1] the number of bonds the atom still has available
    #   [2] the index of the previously derived atom it is bonded to
    # e.g. a 'C' with 2 free bonds, double-bonded to atom 5: ['=C', 2, 5]
    atoms = []

    # branch_spans[i] = j means a branch opens at derived atom i and closes
    # at derived atom j. No two branches start at the same atom.
    branch_spans = {}

    # ring_specs holds (j, k, s) tuples in left-to-right SELFIES order: a
    # ring between derived atoms j and k (j <= k) with bond symbol s.
    ring_specs = []

    _translate_selfies_derive(
        _parse_selfies(selfies), 0, atoms, -1, branch_spans, ring_specs
    )
    _form_rings_bilocally(atoms, ring_specs)

    # wrap each registered branch in parentheses
    for start, end in branch_spans.items():
        atoms[start][0] = '(' + atoms[start][0]
        atoms[end][0] += ')'

    # the SMILES is the concatenation of every derived atom's text
    return "".join(text for text, _, _ in atoms)
# flake8: noqa: C901
# noinspection PyTypeChecker
def _translate_selfies_derive(selfies_gen: Iterable[str],
init_state: int,
derived: List[List[Union[str, int]]],
prev_idx: int,
branches: Dict[int, int],
rings: List[Tuple[int, int, str]]) -> None:
"""Recursive helper for _translate_selfies.
Derives the SMILES symbols one-by-one from a SELFIES, and
populates derived, branches, and rings. The main chain and side branches
of the SELFIES are translated recursively. Rings are not actually
translated, but saved to the rings list to be added later.
:param selfies_gen: an iterable of the symbols of the SELFIES to be
translated, created by ``_parse_selfies``.
:param init_state: the initial derivation state.
:param derived: see ``derived`` in ``_translate_selfies``.
:param prev_idx: the index of the previously derived atom, or -1,
if no atoms have been derived yet.
:param branches: see ``branches`` in ``_translate_selfies``.
:param rings: see ``rings`` in ``_translate_selfies``.
:return: ``None``.
"""
curr_symbol = next(selfies_gen)
def _derive_mol_from_symbols(
symbol_iter, mol, selfies, max_derive,
init_state, root_atom, rings
):
n_derived = 0
state = init_state
prev_atom = root_atom
while curr_symbol != '' and state >= 0:
while (state is not None) and (n_derived < max_derive):
# Case 1: Branch symbol (e.g. [Branch1_2])
if 'Branch' in curr_symbol:
try: # retrieve next symbol
symbol = next(symbol_iter)
n_derived += 1
except StopIteration:
break
branch_init_state, new_state = \
get_next_branch_state(curr_symbol, state)
# Case 1: Branch symbol (e.g. [Branch1])
if "ch" == symbol[-4:-2]:
if state <= 1: # state = 0, 1
pass # ignore no symbols
output = process_branch_symbol(symbol)
if output is None:
_raise_decoder_error(selfies, symbol)
btype, n = output
if state <= 1:
next_state = state
else:
L = int(curr_symbol[-4]) # corresponds to [BranchL_X]
L_symbols = []
for _ in range(L):
L_symbols.append(next(selfies_gen))
binit_state, next_state = next_branch_state(btype, state)
N = get_n_from_symbols(*L_symbols)
Q = _read_index_from_selfies(symbol_iter, n_symbols=n)
n_derived += n + _derive_mol_from_symbols(
symbol_iter, mol, selfies, (Q + 1),
init_state=binit_state, root_atom=prev_atom, rings=rings
)
branch_symbols = []
for _ in range(N + 1):
branch_symbols.append(next(selfies_gen))
branch_gen = _parse_selfies_symbols(branch_symbols)
branch_start = len(derived)
_translate_selfies_derive(branch_gen, branch_init_state,
derived, prev_idx, branches, rings)
branch_end = len(derived) - 1
# resolve C((C)Cl)C --> C(C)(Cl)C
while branch_start in branches:
branch_start = branches[branch_start] + 1
# finally, register the branch in branches
if branch_start <= branch_end:
branches[branch_start] = branch_end
# Case 2: Ring symbol (e.g. [Ring2])
elif 'Ring' in curr_symbol:
elif "ng" == symbol[-4:-2]:
new_state = state
output = process_ring_symbol(symbol)
if output is None:
_raise_decoder_error(selfies, symbol)
ring_type, n, stereo = output
if state == 0:
pass # ignore no symbols
next_state = state
else:
L = int(curr_symbol[-2]) # corresponds to [RingL]
L_symbols = []
for _ in range(L):
L_symbols.append(next(selfies_gen))
ring_order, next_state = next_ring_state(ring_type, state)
bond_info = (ring_order, stereo)
N = get_n_from_symbols(*L_symbols)
Q = _read_index_from_selfies(symbol_iter, n_symbols=n)
n_derived += n
lidx = max(0, prev_atom.index - (Q + 1))
rings.append((mol.get_atom(lidx), prev_atom, bond_info))
left_idx = max(0, prev_idx - (N + 1))
right_idx = prev_idx
# Case 3: [epsilon]
elif "eps" in symbol:
next_state = 0 if (state == 0) else None
bond_symbol = ''
if curr_symbol[1:5] == 'Expl':
bond_symbol = curr_symbol[5]
rings.append((left_idx, right_idx, bond_symbol))
# Case 3: regular symbol (e.g. [N], [=C], [F])
# Case 4: regular symbol (e.g. [N], [=C], [F])
else:
new_symbol, new_state = get_next_state(curr_symbol, state)
if new_symbol != '': # in case of [epsilon]
derived.append([new_symbol, new_state, prev_idx])
output = process_atom_symbol(symbol)
if output is None:
_raise_decoder_error(selfies, symbol)
(bond_order, stereo), atom = output
cap = atom.bonding_capacity
if prev_idx >= 0:
bond_num = get_num_from_bond(new_symbol[0])
derived[prev_idx][1] -= bond_num
bond_order, next_state = next_atom_state(bond_order, cap, state)
if bond_order == 0:
if state == 0:
mol.add_atom(atom, True)
else:
mol.add_atom(atom)
src, dst = prev_atom.index, atom.index
mol.add_bond(src=src, dst=dst, order=bond_order, stereo=stereo)
prev_atom = atom
prev_idx = len(derived) - 1
if next_state is None:
break
state = next_state
curr_symbol = next(selfies_gen) # update symbol and state
state = new_state
while n_derived < max_derive: # consume remaining tokens
try:
next(symbol_iter)
n_derived += 1
except StopIteration:
break
return n_derived
def _form_rings_bilocally(derived: List[List[Union[str, int]]],
rings: List[Tuple[int, int, str]]) -> None:
"""Forms all the rings specified by the rings list, in first-to-last order,
by updating derived.
:param derived: see ``derived`` in ``_translate_selfies``.
:param rings: see ``rings`` in ``_translate_selfies``.
:return: ``None``.
"""
def _raise_decoder_error(selfies, invalid_symbol):
    """Raise a ``DecoderError`` reporting a symbol that could not be decoded.

    :param selfies: the SELFIES string being decoded, echoed in the message.
    :param invalid_symbol: the symbol that triggered the failure.
    :raises DecoderError: always.
    """
    template = "invalid symbol '{}'\n\tSELFIES: {}"
    raise DecoderError(template.format(invalid_symbol, selfies))
# due to the behaviour of allowing multiple rings between the same atom
# pair, or rings between already bonded atoms, we first resolve all rings
# so that only valid rings are left and placed into <ring_locs>.
ring_locs = OrderedDict()
for left_idx, right_idx, bond_symbol in rings:
def _read_index_from_selfies(symbol_iter, n_symbols):
    """Consume ``n_symbols`` symbols from an iterator and decode them as an index.

    :param symbol_iter: an iterator over SELFIES symbols; it is advanced in
        place.
    :param n_symbols: how many symbols to read.
    :return: the result of ``get_index_from_selfies`` on the symbols read.
        If the iterator is exhausted early, every missing position is
        passed as ``None``.
    """
    # next(..., None) pads with None once the iterator runs dry
    index_symbols = [next(symbol_iter, None) for _ in range(n_symbols)]
    return get_index_from_selfies(*index_symbols)
if left_idx == right_idx: # ring to the same atom forbidden
continue
left_end = derived[left_idx]
right_end = derived[right_idx]
bond_num = get_num_from_bond(bond_symbol)
def _form_rings_bilocally(mol, rings):
rings_made = [0] * len(mol)
if left_end[1] <= 0 or right_end[1] <= 0:
continue # no room for bond
for latom, ratom, bond_info in rings:
lidx, ridx = latom.index, ratom.index
if bond_num > min(left_end[1], right_end[1]):
bond_num = min(left_end[1], right_end[1])
bond_symbol = get_bond_from_num(bond_num)
if lidx == ridx: # ring to the same atom forbidden
continue
# ring is formed between two atoms that are already bonded
# e.g. CC1C1C --> CC=CC
if left_idx == right_end[2]:
order, (lstereo, rstereo) = bond_info
lfree = latom.bonding_capacity - mol.get_bond_count(lidx)
rfree = ratom.bonding_capacity - mol.get_bond_count(ridx)
right_symbol = right_end[0]
if lfree <= 0 or rfree <= 0:
continue # no room for ring bond
order = min(order, lfree, rfree)
if right_symbol[0] in {'-', '/', '\\', '=', '#'}:
old_bond = right_symbol[0]
else:
old_bond = ''
if mol.has_bond(a=lidx, b=ridx):
bond = mol.get_dirbond(src=lidx, dst=ridx)
new_order = min(order + bond.order, 3)
mol.update_bond_order(a=lidx, b=ridx, new_order=new_order)
# update bond multiplicity and symbol
new_bond_num = min(bond_num + get_num_from_bond(old_bond), 3)
new_bond_symbol = get_bond_from_num(new_bond_num)
right_end[0] = new_bond_symbol + right_end[0][len(old_bond):]
# ring is formed between two atoms that are not bonded, e.g. C1CC1C
else:
loc = (left_idx, right_idx)
if loc in ring_locs:
# a ring is formed between two atoms that have previously
# been bonded by a ring, so ring bond multiplicity is updated
new_bond_num = min(bond_num
+ get_num_from_bond(ring_locs[loc]), 3)
new_bond_symbol = get_bond_from_num(new_bond_num)
ring_locs[loc] = new_bond_symbol
else:
ring_locs[loc] = bond_symbol
left_end[1] -= bond_num
right_end[1] -= bond_num
# finally, use <ring_locs> to add all the rings into <derived>
ring_counter = 1
for (left_idx, right_idx), bond_symbol in ring_locs.items():
ring_id = str(ring_counter)
if len(ring_id) == 2:
ring_id = "%" + ring_id
ring_counter += 1 # increment
derived[left_idx][0] += bond_symbol + ring_id
derived[right_idx][0] += bond_symbol + ring_id
mol.add_ring_bond(
a=lidx, a_stereo=lstereo, a_pos=rings_made[lidx],
b=ridx, b_stereo=rstereo, b_pos=rings_made[ridx],
order=order
)
rings_made[lidx] += 1
rings_made[ridx] += 1

@@ -1,265 +0,203 @@

from typing import Dict, Iterable, List, Optional, Tuple
from selfies.exceptions import EncoderError, SMILESParserError
from selfies.grammar_rules import get_selfies_from_index
from selfies.utils.linked_list import SinglyLinkedList
from selfies.utils.smiles_utils import (
atom_to_smiles,
bond_to_smiles,
smiles_to_mol
)
from selfies.grammar_rules import get_num_from_bond, get_symbols_from_n
from selfies.kekulize import kekulize_parser
def encoder(smiles: str, strict: bool = True) -> str:
"""Translates a SMILES string into its corresponding SELFIES string.
def encoder(smiles: str, print_error: bool = False) -> Optional[str]:
"""Translates a SMILES into a SELFIES.
This translation is deterministic and does not depend on the
current semantic constraints. Additionally, it preserves the atom order
of the input SMILES string; thus, one could generate randomized SELFIES
strings by generating randomized SMILES strings, and then translating them.
The SMILES to SELFIES translation occurs independently of the SELFIES
alphabet and grammar. Thus, :func:`selfies.encoder` will work regardless of
the alphabet and grammar rules that :py:mod:`selfies` is operating on,
assuming the input is a valid SMILES. Additionally, :func:`selfies.encoder`
preserves the atom and branch order of the input SMILES; thus, one
could generate random SELFIES corresponding to the same molecule by
generating random SMILES, and then translating them.
By nature of SELFIES, it is impossible to represent molecules that
violate the current semantic constraints as SELFIES strings.
Thus, we provide the ``strict`` flag to guard against such cases. If
``strict=True``, then this function will raise a
:class:`selfies.EncoderError` if the input SMILES string represents
a molecule that violates the semantic constraints. If
``strict=False``, then this function will not raise any error; however,
calling :func:`selfies.decoder` on a SELFIES string generated this
way will *not* be guaranteed to recover a SMILES string representing
the original molecule.
However, encoding and then decoding a SMILES may not necessarily yield
the original SMILES. Reasons include:
:param smiles: the SMILES string to be translated. It is recommended to
use RDKit to check that the strings passed into this function
are valid SMILES strings.
:param strict: if ``True``, this function will check that the
input SMILES string obeys the semantic constraints.
Defaults to ``True``.
:return: a SELFIES string translated from the input SMILES string.
:raises EncoderError: if the input SMILES string is invalid,
cannot be kekulized, or violates the semantic constraints with
``strict=True``.
1. SMILES with aromatic symbols are automatically
Kekulized before being translated.
2. SMILES that violate the bond constraints specified by
:mod:`selfies` will be successfully encoded by
:func:`selfies.encoder`, but then decoded into a new molecule
that satisfies the constraints.
3. The exact ring numbering order is lost in :func:`selfies.encoder`,
and cannot be reconstructed by :func:`selfies.decoder`.
Finally, note that :func:`selfies.encoder` does **not** check if the input
SMILES is valid, and should not be expected to reject invalid inputs.
It is recommended to use RDKit to first verify that the SMILES are
valid.
:param smiles: the SMILES to be translated.
:param print_error: if True, error messages will be printed to console.
Defaults to False.
:return: the SELFIES translation of ``smiles``. If an error occurs,
and ``smiles`` cannot be translated, :code:`None` is returned instead.
:Example:
>>> import selfies
>>> selfies.encoder('C=CF')
>>> import selfies as sf
>>> sf.encoder("C=CF")
'[C][=C][F]'
.. note:: Currently, :func:`selfies.encoder` does not support the
following types of SMILES:
.. note:: This function does not currently support SMILES with:
* SMILES using ring numbering across a dot-bond symbol
to specify bonds, e.g. ``C1.C2.C12`` (propane) or
``c1cc([O-].[Na+])ccc1`` (sodium phenoxide).
* SMILES with ring numbering between atoms that are over
``16 ** 3 = 4096`` atoms apart.
* SMILES using the wildcard symbol ``*``.
* SMILES using chiral specifications other than ``@`` and ``@@``.
* The wildcard symbol ``*``.
* The quadruple bond symbol ``$``.
* Chirality specifications other than ``@`` and ``@@``.
* Ring bonds across a dot symbol (e.g. ``c1cc([O-].[Na+])ccc1``) or
ring bonds between atoms that are over 4000 atoms apart.
Although SELFIES does not have aromatic symbols, this function
*does* support aromatic SMILES strings by internally kekulizing them
before translation.
"""
try:
if '*' in smiles:
raise ValueError("wildcard atom '*' not supported")
mol = smiles_to_mol(smiles)
except SMILESParserError as err:
err_msg = "failed to parse input\n\tSMILES: {}".format(smiles)
raise EncoderError(err_msg) from err
all_selfies = [] # process dot-separated fragments separately
for s in smiles.split("."):
all_selfies.append(_translate_smiles(s))
return '.'.join(all_selfies)
if not mol.kekulize():
err_msg = "kekulization failed\n\tSMILES: {}".format(smiles)
raise EncoderError(err_msg)
except ValueError as err:
if print_error:
print("Encoding error '{}': {}.".format(smiles, err))
return None
if strict:
_check_bond_constraints(mol, smiles)
# invert chirality of atoms where necessary,
# such that they are restored when the SELFIES is decoded
for atom in mol.get_atoms():
if ((atom.chirality is not None)
and mol.has_out_ring_bond(atom.index)
and _should_invert_chirality(mol, atom)):
atom.invert_chirality()
ATOM_TYPE = 1
BRANCH_TYPE = 2
RING_TYPE = 3
fragments = []
for root in mol.get_roots():
derived = list(_fragment_to_selfies(mol, None, root))
fragments.append("".join(derived))
return ".".join(fragments)
def _parse_smiles(smiles: str) -> Iterable[Tuple[str, str, int]]:
"""Parses a SMILES into its symbols.
def _check_bond_constraints(mol, smiles):
errors = []
A generator, which parses a SMILES string and returns its symbol(s)
one-by-one as a tuple of:
(1) the bond symbol connecting the current atom/ring/branch symbol
to the previous atom/ring/branch symbol (e.g. '=', '', '#')
(2) the atom/ring/branch symbol as a string (e.g. 'C', '12', '(')
(3) the type of the symbol in (2), represented as an integer that is
either ``ATOM_TYPE``, ``BRANCH_TYPE``, or ``RING_TYPE``.
As a precondition, we also assume ``smiles`` has no dots in it.
for atom in mol.get_atoms():
bond_cap = atom.bonding_capacity
bond_count = mol.get_bond_count(atom.index)
if bond_count > bond_cap:
errors.append((atom_to_smiles(atom), bond_count, bond_cap))
:param smiles: the SMILES to be parsed.
:return: an iterable of the symbol(s) of the SELFIES along with
their types.
"""
if errors:
err_msg = "input violates the currently-set semantic constraints\n" \
"\tSMILES: {}\n" \
"\tErrors:\n".format(smiles)
for e in errors:
err_msg += "\t[{:} with {} bond(s) - " \
"a max. of {} bond(s) was specified]\n".format(*e)
raise EncoderError(err_msg)
i = 0
while 0 <= i < len(smiles):
def _should_invert_chirality(mol, atom):
out_bonds = mol.get_out_dirbonds(atom.index)
bond = ''
if smiles[i] in {'-', '/', '\\', '=', '#', ":"}:
bond = smiles[i]
i += 1
if smiles[i].isalpha(): # organic subset elements
if smiles[i: i + 2] in ('Br', 'Cl'): # two letter elements
symbol = smiles[i: i + 2]
symbol_type = ATOM_TYPE
i += 2
else:
symbol = smiles[i] # one letter elements (e.g. C, N, ...)
symbol_type = ATOM_TYPE
i += 1
elif smiles[i] in ('(', ')'): # open and closed branch brackets
bond = smiles[i + 1: i + 2]
symbol = smiles[i]
symbol_type = BRANCH_TYPE
i += 1
elif smiles[i] == '[': # atoms encased in brackets (e.g. [NH])
r_idx = smiles.find(']', i + 1)
symbol = smiles[i: r_idx + 1]
symbol_type = ATOM_TYPE
i = r_idx + 1
if r_idx == -1:
raise ValueError("malformed SMILES, missing ']'")
# quick chirality specification check
chiral_i = symbol.find('@')
if symbol[chiral_i + 1].isalpha() and symbol[chiral_i + 1] != 'H':
raise ValueError("chiral specification '{}' not supported"
.format(symbol))
elif smiles[i].isdigit(): # one-digit ring number
symbol = smiles[i]
symbol_type = RING_TYPE
i += 1
elif smiles[i] == '%': # two-digit ring number (e.g. %12)
symbol = smiles[i + 1: i + 3]
symbol_type = RING_TYPE
i += 3
# 1. rings whose right number are bonded to this atom (e.g. ...1...X1)
# 2. rings whose left number are bonded to this atom (e.g. X1...1...)
# 3. branches and other (e.g. X(...)...)
partition = [[], [], []]
for i, bond in enumerate(out_bonds):
if not bond.ring_bond:
partition[2].append(i)
elif bond.src < bond.dst:
partition[1].append(i)
else:
raise ValueError("unrecognized symbol '{}'".format(smiles[i]))
partition[0].append(i)
partition[1].sort(key=lambda x: out_bonds[x].dst)
yield bond, symbol, symbol_type
# construct permutation
perm = partition[0] + partition[1] + partition[2]
count = 0
for i in range(len(perm)):
for j in range(i + 1, len(perm)):
if perm[i] > perm[j]:
count += 1
return count % 2 != 0 # if odd permutation, should invert chirality
def _translate_smiles(smiles: str) -> str:
"""A helper for ``selfies.encoder``, which translates a SMILES into a
SELFIES (assuming the input SMILES contains no dots).
def _fragment_to_selfies(mol, bond_into_root, root):
derived = SinglyLinkedList()
:param smiles: the SMILES to be translated.
:return: the SELFIES translation of SMILES.
"""
bond_into_curr, curr = bond_into_root, root
while True:
curr_atom = mol.get_atom(curr)
derived.append(_atom_to_selfies(bond_into_curr, curr_atom))
smiles_gen = _parse_smiles(smiles)
out_bonds = mol.get_out_dirbonds(curr)
for i, bond in enumerate(out_bonds):
char_set = set(smiles)
if any(c in char_set for c in ['c', 'n', 'o', 'p', 'a', 's']):
smiles_gen = kekulize_parser(smiles_gen)
if bond.ring_bond:
if bond.src < bond.dst:
continue
# a simple mutable counter to track which atom was the i-th derived atom
derive_counter = [0]
rev_bond = mol.get_dirbond(src=bond.dst, dst=bond.src)
ring_len = bond.src - bond.dst
Q_as_symbols = get_selfies_from_index(ring_len - 1)
ring_symbol = "[{}Ring{}]".format(
_ring_bonds_to_selfies(rev_bond, bond),
len(Q_as_symbols)
)
# a dictionary to keep track of the rings to be made. If a ring with id
# X is connected to the i-th and j-th derived atoms (i < j) with bond
# symbol s, then after the i-th atom is derived, rings[X] = (s, i).
# As soon as the j-th atom is derived, rings[X] is removed from <rings>,
# and the ring is made.
rings = {}
derived.append(ring_symbol)
for symbol in Q_as_symbols:
derived.append(symbol)
selfies, _ = _translate_smiles_derive(smiles_gen, rings, derive_counter)
elif i == len(out_bonds) - 1:
bond_into_curr, curr = bond, bond.dst
if rings:
raise ValueError("malformed ring numbering or ring numbering "
"across a dot symbol")
return selfies
def _translate_smiles_derive(smiles_gen: Iterable[Tuple[str, str, int]],
rings: Dict[int, Tuple[str, int]],
counter: List[int]) -> Tuple[str, int]:
"""Recursive helper for _translate_smiles.
Derives the SELFIES from a SMILES, and returns a tuple of (1) the
translated SELFIES and (2) the symbol length of the translated SELFIES.
:param smiles_gen: an iterable of the symbols (and their types)
of the SMILES to be translated, created by ``_parse_smiles``.
:param rings: See ``rings`` in ``_translate_smiles``.
:param counter: a one-element list that serves as a mutable counter.
See ``derived_counter`` in ``_translate_smiles``.
:return: A tuple of the translated SELFIES and its symbol length.
"""
selfies = ""
selfies_len = 0
prev_idx = -1
for bond, symbol, symbol_type in smiles_gen:
if bond == '-': # ignore explicit single bonds
bond = ''
if symbol_type == ATOM_TYPE:
if symbol[0] == '[':
selfies += "[{}{}expl]".format(bond, symbol[1:-1])
else:
selfies += "[{}{}]".format(bond, symbol)
prev_idx = counter[0]
counter[0] += 1
selfies_len += 1
branch = _fragment_to_selfies(mol, bond, bond.dst)
Q_as_symbols = get_selfies_from_index(len(branch) - 1)
branch_symbol = "[{}Branch{}]".format(
_bond_to_selfies(bond, show_stereo=False),
len(Q_as_symbols)
)
elif symbol_type == BRANCH_TYPE:
if symbol == '(':
derived.append(branch_symbol)
for symbol in Q_as_symbols:
derived.append(symbol)
derived.extend(branch)
# NOTE: looping inside a loop on a generator will produce
# expected behaviour in this case.
# end of chain
if (not out_bonds) or out_bonds[-1].ring_bond:
break
branch, branch_len = \
_translate_smiles_derive(smiles_gen, rings, counter)
return derived
N_as_symbols = get_symbols_from_n(branch_len - 1)
bond_num = get_num_from_bond(bond)
selfies += "[Branch{}_{}]".format(len(N_as_symbols), bond_num)
selfies += ''.join(N_as_symbols) + branch
selfies_len += 1 + len(N_as_symbols) + branch_len
def _bond_to_selfies(bond, show_stereo=True):
if not show_stereo and (bond.order == 1):
return ""
return bond_to_smiles(bond)
else: # symbol == ')'
break
else: # symbol_type == RING_TYPE
ring_id = int(symbol)
def _ring_bonds_to_selfies(lbond, rbond):
assert lbond.order == rbond.order
if ring_id in rings:
left_bond, left_end = rings.pop(ring_id)
right_bond, right_end = bond, prev_idx
if (lbond.order != 1) or all(b.stereo is None for b in (lbond, rbond)):
return _bond_to_selfies(lbond, show_stereo=False)
else:
bond_char = "-" if (lbond.stereo is None) else lbond.stereo
bond_char += "-" if (rbond.stereo is None) else rbond.stereo
return bond_char
ring_len = right_end - left_end
N_as_symbols = get_symbols_from_n(ring_len - 1)
if left_bond != '':
selfies += "[Expl{}Ring{}]".format(left_bond,
len(N_as_symbols))
elif right_bond != '':
selfies += "[Expl{}Ring{}]".format(right_bond,
len(N_as_symbols))
else:
selfies += "[Ring{}]".format(len(N_as_symbols))
selfies += ''.join(N_as_symbols)
selfies_len += 1 + len(N_as_symbols)
else:
rings[ring_id] = (bond, prev_idx)
return selfies, selfies_len
def _atom_to_selfies(bond, atom):
    """Build the SELFIES symbol ``[<bond><atom>]`` for a single atom.

    :param bond: the bond leading into ``atom``, or ``None`` if there is none
        (in which case no bond prefix is written).
    :param atom: the atom to render; it must not be aromatic.
    :return: the bracketed SELFIES symbol.
    """
    assert not atom.is_aromatic

    if bond is None:
        bond_char = ""
    else:
        bond_char = _bond_to_selfies(bond)
    atom_text = atom_to_smiles(atom, brackets=False)
    return "[{}{}]".format(bond_char, atom_text)

@@ -1,428 +0,208 @@

from itertools import product
from typing import Dict, List, Optional, Set, Tuple
import functools
import itertools
import re
from typing import Any, List, Optional, Tuple
default_bond_constraints = {
'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
'O': 2, 'O+1': 3, 'O-1': 1,
'N': 3, 'N+1': 4, 'N-1': 2,
'C': 4, 'C+1': 5, 'C-1': 3,
'P': 5, 'P+1': 6, 'P-1': 4,
'S': 6, 'S+1': 7, 'S-1': 5,
'?': 8
}
octet_rule_bond_constraints = dict(default_bond_constraints)
octet_rule_bond_constraints.update(
{'S': 2, 'S+1': 3, 'S-1': 1, 'P': 3, 'P+1': 4, 'P-1': 2}
from selfies.constants import (
ELEMENTS,
INDEX_ALPHABET,
INDEX_CODE,
ORGANIC_SUBSET
)
from selfies.mol_graph import Atom
from selfies.utils.smiles_utils import smiles_to_bond
hypervalent_bond_constraints = dict(default_bond_constraints)
hypervalent_bond_constraints.update(
{'Cl': 7, 'Br': 7, 'I': 7, 'N': 5}
)
_bond_constraints = default_bond_constraints
def process_atom_symbol(symbol: str) -> Optional[Tuple[Any, Atom]]:
    """Turn a SELFIES atom symbol into bond information and a fresh atom.

    Results of ``_process_atom_selfies_no_cache`` are memoised in
    ``_PROCESS_ATOM_CACHE``; failed parses (``None``) are not stored.

    :param symbol: the SELFIES symbol to process.
    :return: a tuple ``(bond_info, atom)``, or ``None`` if the symbol cannot
        be parsed or the constructed atom has a negative bonding capacity.
    """
    if symbol in _PROCESS_ATOM_CACHE:
        entry = _PROCESS_ATOM_CACHE[symbol]
    else:
        entry = _process_atom_selfies_no_cache(symbol)
        if entry is None:
            return None
        _PROCESS_ATOM_CACHE[symbol] = entry

    bond_info, atom_fac = entry
    atom = atom_fac()  # the cache holds a factory, so a new atom is made per call
    if atom.bonding_capacity < 0:
        return None  # too many Hs (e.g. [CH9])
    return bond_info, atom
def get_semantic_robust_alphabet() -> Set[str]:
"""Returns a subset of all symbols that are semantically constrained
by :mod:`selfies`.
These semantic constraints can be configured with
:func:`selfies.set_semantic_constraints`.
def process_branch_symbol(symbol: str) -> Optional[Tuple[int, int]]:
    """Look up a branch symbol in the precomputed branch cache.

    :param symbol: the SELFIES symbol to look up.
    :return: the cached tuple for ``symbol``, or ``None`` if the symbol is
        not a key of ``_PROCESS_BRANCH_CACHE``.
    """
    return _PROCESS_BRANCH_CACHE.get(symbol)
:return: a subset of all symbols that are semantically constrained.
"""
alphabet_subset = set()
def process_ring_symbol(symbol: str) -> Optional[Tuple[int, int, Any]]:
    """Look up a ring symbol in the precomputed ring cache.

    :param symbol: the SELFIES symbol to look up.
    :return: the cached tuple for ``symbol``, or ``None`` if the symbol is
        not a key of ``_PROCESS_RING_CACHE``.
    """
    return _PROCESS_RING_CACHE.get(symbol)
organic_subset = {'B', 'C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I'}
bonds = {'': 1, '=': 2, '#': 3}
# add atomic symbols
for (a, c), (b, m) in product(_bond_constraints.items(), bonds.items()):
if (m > c) or (a == '?'):
continue
if a in organic_subset:
symbol = "[{}{}]".format(b, a)
else:
symbol = "[{}{}expl]".format(b, a)
alphabet_subset.add(symbol)
# add branch and ring symbols
for i in range(1, 4):
alphabet_subset.add("[Ring{}]".format(i))
alphabet_subset.add("[Expl=Ring{}]".format(i))
for j in range(1, 4):
alphabet_subset.add("[Branch{}_{}]".format(i, j))
return alphabet_subset
def get_default_constraints() -> Dict[str, int]:
    """Return a copy of the preset "default" bond constraints.

    :return: a new dictionary with the default constraint settings.
    """
    return default_bond_constraints.copy()
def get_octet_rule_constraints() -> Dict[str, int]:
    """Return a copy of the preset "octet rule" bond constraints.

    This preset is a stricter variant of the default constraints, chosen so
    that the `octet rule <https://en.wikipedia.org/wiki/Octet_rule>`_ is
    obeyed: ``S`` is limited to 2 bonds and ``P`` to 3 bonds (with the
    corresponding adjustments for ``S+``, ``S-``, ``P+``, ``P-``).

    :return: a new dictionary with the octet rule constraint settings.
    """
    return octet_rule_bond_constraints.copy()
def get_hypervalent_constraints() -> Dict[str, int]:
    """Return a copy of the preset "hypervalent" bond constraints.

    This preset relaxes the default constraints to permit `hypervalent
    molecules <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_:
    ``Cl``, ``Br`` and ``I`` may form up to 7 bonds, and ``N`` up to 5.

    :return: a new dictionary with the hypervalent constraint settings.
    """
    return hypervalent_bond_constraints.copy()
def get_semantic_constraints() -> Dict[str, int]:
    """Return the bond constraints :mod:`selfies` is currently using.

    The value reflects the most recent call to
    :func:`selfies.set_semantic_constraints`, or the default constraints if
    that function has not been called. A copy is returned, so mutating the
    result does not affect the active constraints. See
    :func:`selfies.set_semantic_constraints` for further explanation.

    :return: a new dictionary with the currently active bond constraints.
    """
    return _bond_constraints.copy()
def set_semantic_constraints(
        bond_constraints: Optional[Dict[str, int]] = None) -> None:
    """Configure the semantic constraints that :mod:`selfies` enforces.

    The SELFIES grammar is derived dynamically from ``bond_constraints``,
    a dictionary whose keys are atoms and/or ions (e.g. ``I``, ``Fe+2``).
    Ions are written as ``E+C`` or ``E-C``, with ``E`` an element and ``C``
    a positive integer. Each value is the maximum number of bonds that the
    atom or ion may form, between 1 and 8 inclusive. For example:

        * ``bond_constraints['I'] = 1``
        * ``bond_constraints['C'] = 4``

    :func:`selfies.decoder` only produces SMILES that respect these limits.
    With the example above, ``'[C][=I]'`` and ``'[I][=C]'`` decode to
    ``'CI'`` and ``'IC'`` respectively, since ``I`` may form one bond at
    most.

    Atoms or ions absent from ``bond_constraints`` fall back to the limit
    stored under the key ``'?'``, which must therefore always be present
    (between 1 and 8 inclusive).

    :param bond_constraints: the new semantic constraints. Defaults to
        ``None``, in which case the default constraints are restored.
    :return: ``None``.
    :raises ValueError: if ``'?'`` is not a key, or any value lies outside
        the range 1 to 8 inclusive.
    """
    global _bond_constraints

    if bond_constraints is None:
        _bond_constraints = default_bond_constraints
        return

    # validate before touching the active constraints
    if '?' not in bond_constraints:
        raise ValueError("bond_constraints missing '?' as a key.")

    for key, limit in bond_constraints.items():
        if 1 <= limit <= 8:
            continue
        raise ValueError("bond_constraints['{}'] not between "
                         "1 and 8 inclusive.".format(key))

    # store a private copy so later edits by the caller have no effect
    _bond_constraints = dict(bond_constraints)
# Symbol State Dict Functions ==============================================
def get_next_state(symbol: str, state: int) -> Tuple[str, int]:
"""Enforces the grammar rules for standard SELFIES symbols.
Given the current non-branch, non-ring symbol and current derivation
state, retrieves the derived SMILES symbol and the next derivation
state.
:param symbol: a SELFIES symbol that is not a Ring or Branch.
:param state: the current derivation state.
:return: a tuple of (1) the derived symbol, and
(2) the next derivation state.
"""
if symbol == '[epsilon]':
return ('', 0) if state == 0 else ('', -1)
# convert to smiles symbol
bond = ''
if symbol[1] in {'/', '\\', '=', '#'}:
bond = symbol[1]
bond_num = get_num_from_bond(bond)
if symbol[-5:] == 'expl]': # e.g. [C@@Hexpl]
smiles_symbol = "[{}]".format(symbol[1 + len(bond):-5])
else:
smiles_symbol = symbol[1 + len(bond):-1]
# get bond capacity
element, h_count, charge = parse_atom_symbol(smiles_symbol)
if charge == 0:
atom_or_ion = element
else:
atom_or_ion = "{}{:+}".format(element, charge)
max_bonds = _bond_constraints.get(atom_or_ion,
_bond_constraints['?'])
if (h_count > max_bonds) or (h_count == max_bonds and state > 0):
raise ValueError("too many Hs in symbol '{}'; consider "
"adjusting bond constraints".format(symbol))
max_bonds -= h_count # hydrogens consume 1 bond
# calculate next state
def next_atom_state(
bond_order: int, bond_cap: int, state: int
) -> Tuple[int, Optional[int]]:
if state == 0:
bond = ''
next_state = max_bonds
else:
if bond_num > min(state, max_bonds):
bond_num = min(state, max_bonds)
bond = get_bond_from_num(bond_num)
bond_order = 0
next_state = max_bonds - bond_num
if next_state == 0:
next_state = -1
bond_order = min(bond_order, state, bond_cap)
bonds_left = bond_cap - bond_order
next_state = None if (bonds_left == 0) else bonds_left
return bond_order, next_state
return (bond + smiles_symbol), next_state
def next_branch_state(
        branch_type: int, state: int
) -> Tuple[int, Optional[int]]:
    """Split the current derivation state between a new branch and the chain.

    :param branch_type: the branch type, which must be 1, 2 or 3.
    :param state: the current derivation state, which must exceed 1.
    :return: a tuple of (1) the state the branch starts in and (2) the
        state left over after the branch is created.
    """
    assert 1 <= branch_type <= 3
    assert state > 1

    # the branch takes at most ``branch_type``, and always leaves at least 1
    if branch_type < state - 1:
        branch_init_state = branch_type
    else:
        branch_init_state = state - 1
    return branch_init_state, state - branch_init_state
# Branch State Dict Functions =================================================
def get_next_branch_state(branch_symbol: str, state: int) -> Tuple[int, int]:
"""Enforces the grammar rules for SELFIES Branch symbols.
def next_ring_state(
ring_type: int, state: int
) -> Tuple[int, Optional[int]]:
assert state > 0
Given the branch symbol and current derivation state, retrieves
the initial branch derivation state (i.e. the derivation state that the
new branch begins on), and the next derivation state (i.e. the derivation
state after the branch is created).
bond_order = min(ring_type, state)
bonds_left = state - bond_order
next_state = None if (bonds_left == 0) else bonds_left
return bond_order, next_state
:param branch_symbol: the branch symbol (e.g. [Branch1_2], [Branch3_1])
:param state: the current derivation state.
:return: a tuple of (1) the initial branch state, and
(2) the next derivation state.
"""
branch_type = int(branch_symbol[-2]) # branches of the form [BranchL_X]
if not (1 <= branch_type <= 3):
raise ValueError("unknown branch symbol '{}'".format(branch_symbol))
if 2 <= state <= 8:
branch_init_state = min(state - 1, branch_type)
next_state = state - branch_init_state
return branch_init_state, next_state
else:
return -1, state
# SELFIES Symbol to N Functions ============================================
_index_alphabet = ['[C]', '[Ring1]', '[Ring2]',
'[Branch1_1]', '[Branch1_2]', '[Branch1_3]',
'[Branch2_1]', '[Branch2_2]', '[Branch2_3]',
'[O]', '[N]', '[=N]', '[=C]', '[#C]', '[S]', '[P]']
# _alphabet_code takes as a key a SELFIES symbol, and its corresponding value
# is the index of the key.
_alphabet_code = {c: i for i, c in enumerate(_index_alphabet)}
def get_n_from_symbols(*symbols: List[str]) -> int:
"""Computes N from a list of SELFIES symbols.
Converts a list of SELFIES symbols [c_1, ..., c_n] into a number N.
This is done by converting each symbol c_n to an integer idx(c_n) via
``_alphabet_code``, and then treating the list as a number in base
len(_alphabet_code). If a symbol is unrecognized, it is given value 0 by
default.
:param symbols: a list of SELFIES symbols.
:return: the corresponding N for ``symbols``.
"""
N = 0
def get_index_from_selfies(*symbols: List[str]) -> int:
index = 0
for i, c in enumerate(reversed(symbols)):
N_i = _alphabet_code.get(c, 0) * (len(_alphabet_code) ** i)
N += N_i
return N
index += INDEX_CODE.get(c, 0) * (len(INDEX_CODE) ** i)
return index
def get_symbols_from_n(n: int) -> List[str]:
"""Converts an integer n into a list of SELFIES symbols that, if
passed into ``get_n_from_symbols`` in that order, would have produced n.
def get_selfies_from_index(index: int) -> List[str]:
if index < 0:
raise IndexError()
elif index == 0:
return [INDEX_ALPHABET[0]]
:param n: an integer from 0 to 4095 inclusive.
:return: a list of SELFIES symbols representing n in base
``len(_alphabet_code)``.
"""
if n == 0:
return [_index_alphabet[0]]
symbols = []
base = len(_index_alphabet)
while n:
symbols.append(_index_alphabet[n % base])
n //= base
base = len(INDEX_ALPHABET)
while index:
symbols.append(INDEX_ALPHABET[index % base])
index //= base
return symbols[::-1]
# Helper Functions ============================================================
# =============================================================================
# Caches (for computational speed)
# =============================================================================
def get_num_from_bond(bond_symbol: str) -> int:
"""Retrieves the bond multiplicity from a SMILES symbol representing
a bond. If ``bond_symbol`` is not known, 1 is returned by default.
# Anchored regex for a single bracketed SELFIES atom symbol. It exposes six
# capture groups, in order: bond char, isotope, element, chiral tag, H count
# and charge. Optional groups capture the empty string when absent.
# NOTE(review): the H count accepts exactly one digit after "H", and the
# charge digits are drawn from 1-9 only, so a magnitude containing "0"
# (e.g. "+10") does not match - confirm this is intended.
SELFIES_ATOM_PATTERN = re.compile(
r"^[\[]" # opening square bracket [
r"([=#/\\]?)" # bond char
r"(\d*)" # isotope number (optional, e.g. 123, 26)
r"([A-Z][a-z]?)" # element symbol
r"([@]{0,2})" # chiral_tag (optional, only @ and @@ supported)
r"((?:[H]\d)?)" # H count (optional, e.g. H1, H3)
r"((?:[+-][1-9]+)?)" # charge (optional, e.g. +1)
r"[]]$" # closing square bracket ]
)
:param bond_symbol: a SMILES symbol representing a bond.
:return: the bond multiplicity of ``bond_symbol``, or 1 if
``bond_symbol`` is not recognized.
"""
if bond_symbol == "=":
return 2
elif bond_symbol == "#":
return 3
else:
return 1
def _process_atom_selfies_no_cache(symbol):
m = SELFIES_ATOM_PATTERN.match(symbol)
if m is None:
return None
bond_char, isotope, element, chirality, h_count, charge = m.groups()
if symbol[1 + len(bond_char):-1] in ORGANIC_SUBSET:
atom_fac = functools.partial(Atom, element=element, is_aromatic=False)
return smiles_to_bond(bond_char), atom_fac
def get_bond_from_num(n: int) -> str:
"""Returns the SMILES symbol representing a bond with multiplicity
``n``. More specifically, ``'' = 1`` and ``'=' = 2`` and ``'#' = 3``.
isotope = None if (isotope == "") else int(isotope)
if element not in ELEMENTS:
return None
chirality = None if (chirality == "") else chirality
:param n: either 1, 2, 3.
:return: the SMILES symbol representing a bond with multiplicity ``n``.
"""
s = h_count
if s == "":
h_count = 0
else:
h_count = int(s[1:])
return ('', '=', '#')[n - 1]
s = charge
if s == "":
charge = 0
else:
charge = int(s[1:])
charge *= 1 if (s[0] == "+") else -1
atom_fac = functools.partial(
Atom,
element=element,
is_aromatic=False,
isotope=isotope,
chirality=chirality,
h_count=h_count,
charge=charge
)
def find_element(atom_symbol: str) -> Tuple[int, int]:
"""Returns the indices of the element component of a SMILES atom symbol.
return smiles_to_bond(bond_char), atom_fac
That is, if atom_symbol[i:j] is the element substring of the SMILES atom,
then (i, j) is returned. For example:
* _find_element('b') = (0, 1).
* _find_element('B') = (0, 1).
* _find_element('[13C]') = (3, 4).
* _find_element('[nH+]') = (1, 2).
:param atom_symbol: a SMILES atom.
:return: a tuple of the indices of the element substring of
``atom_symbol``.
"""
def _build_atom_cache():
cache = dict()
common_symbols = [
"[#C+1]", "[#C-1]", "[#C]", "[#N+1]", "[#N]", "[#O+1]", "[#P+1]",
"[#P-1]", "[#P]", "[#S+1]", "[#S-1]", "[#S]", "[=C+1]", "[=C-1]",
"[=C]", "[=N+1]", "[=N-1]", "[=N]", "[=O+1]", "[=O]", "[=P+1]",
"[=P-1]", "[=P]", "[=S+1]", "[=S-1]", "[=S]", "[Br]", "[C+1]", "[C-1]",
"[C]", "[Cl]", "[F]", "[H]", "[I]", "[N+1]", "[N-1]", "[N]", "[O+1]",
"[O-1]", "[O]", "[P+1]", "[P-1]", "[P]", "[S+1]", "[S-1]", "[S]"
]
if atom_symbol[0] != '[':
return 0, len(atom_symbol)
for symbol in common_symbols:
cache[symbol] = _process_atom_selfies_no_cache(symbol)
return cache
i = 1
while atom_symbol[i].isdigit(): # skip isotope number
i += 1
if atom_symbol[i + 1].isalpha() and atom_symbol[i + 1] != 'H':
return i, i + 2
else:
return i, i + 1
def _build_branch_cache():
    """Precomputes the lookup table for SELFIES branch symbols.

    For every branch length ``L`` in 1..3 and every bond character in
    ``''``, ``'='``, ``'#'``, the symbol ``[{bond}Branch{L}]`` is mapped to
    a pair of (1) the first item returned by ``smiles_to_bond`` for that
    bond character (presumably the bond order) and (2) ``L``.

    :return: a dictionary keyed by branch symbol.
    """
    return {
        "[{}Branch{}]".format(bond_char, L): (smiles_to_bond(bond_char)[0], L)
        for L in range(1, 4)
        for bond_char in ["", "=", "#"]
    }
def parse_atom_symbol(atom_symbol: str) -> Tuple[str, int, int]:
"""Parses a SMILES atom symbol and returns its element component,
number of associated hydrogens, and charge.
def _build_ring_cache():
cache = dict()
for L in range(1, 4):
# [RingL], [=RingL], [#RingL]
for bond_char in ["", "=", "#"]:
symbol = "[{}Ring{}]".format(bond_char, L)
order, stereo = smiles_to_bond(bond_char)
cache[symbol] = (order, L, (stereo, stereo))
See http://opensmiles.org/opensmiles.html for the formal grammar
of SMILES atom symbols. Note that only @ and @@ are currently supported
as chiral specifications.
# [-/RingL], [\/RingL], [\-RingL], ...
for lchar, rchar in itertools.product(["-", "/", "\\"], repeat=2):
if lchar == rchar == "-":
continue
symbol = "[{}{}Ring{}]".format(lchar, rchar, L)
order, lstereo = smiles_to_bond(lchar)
order, rstereo = smiles_to_bond(rchar)
cache[symbol] = (order, L, (lstereo, rstereo))
return cache
:param atom_symbol: a SMILES atom symbol.
:return: a tuple of (1) the element of ``atom_symbol``, (2) the hydrogen
count, and (3) the charge.
"""
if atom_symbol[0] != '[':
return atom_symbol, 0, 0
_PROCESS_ATOM_CACHE = _build_atom_cache()
atom_start, atom_end = find_element(atom_symbol)
i = atom_end
_PROCESS_BRANCH_CACHE = _build_branch_cache()
# skip chirality
if atom_symbol[i] == '@': # e.g. @
i += 1
if atom_symbol[i] == '@': # e.g. @@
i += 1
h_count = 0 # hydrogen count
if atom_symbol[i] == 'H':
h_count = 1
i += 1
if atom_symbol[i].isdigit(): # e.g. [CH2]
h_count = int(atom_symbol[i])
i += 1
charge = 0 # charge count
if atom_symbol[i] in ('+', '-'):
charge = 1 if atom_symbol[i] == '+' else -1
i += 1
if atom_symbol[i] in ('+', '-'): # e.g. [Cu++]
while atom_symbol[i] in ('+', '-'):
charge += (1 if atom_symbol[i] == '+' else -1)
i += 1
elif atom_symbol[i].isdigit(): # e.g. [Cu+2]
s = i
while atom_symbol[i].isdigit():
i += 1
charge *= int(atom_symbol[s:i])
return atom_symbol[atom_start: atom_end], h_count, charge
_PROCESS_RING_CACHE = _build_ring_cache()

@@ -10,4 +10,4 @@ #!/usr/bin/env python

name="selfies",
version="1.0.4",
author="Mario Krenn",
version="2.0.0",
author="Mario Krenn, Alston Lo, and many other contributors",
author_email="mario.krenn@utoronto.ca, alan@aspuru.com",

@@ -14,0 +14,0 @@ description="SELFIES (SELF-referencIng Embedded Strings) is a "

from typing import Dict, Iterable, List, Set, Tuple, Union
from selfies.grammar_rules import find_element, get_num_from_bond, \
parse_atom_symbol
# Tags stored in the third slot of every parsed SMILES symbol entry; they
# tell the graph builder whether the entry is an atom, a branch bracket,
# or a ring-bond number.
ATOM_TYPE, BRANCH_TYPE, RING_TYPE = 1, 2, 3
def kekulize_parser(smiles_gen: Iterable[Tuple[str, str, int]]) \
        -> Iterable[Tuple[str, str, int]]:
    """Kekulizes a SMILES that is given as an iterable of parsed symbols.

    This generator sits between ``encoder._parse_smiles`` and its consumer
    and acts as a filter that kekulizes the SMILES. Working directly on the
    already-parsed symbols keeps string parsing and concatenation to a
    minimum.

    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
    -treatment-of-aromaticity-in-the-smiles-language/

    :param smiles_gen: an iterator returned by ``encoder._parse_smiles``.
    :return: an iterator representing the kekulized SMILES, in the same
        format as that returned by ``encoder._parse_smiles``.
    """

    # The symbols are needed by several helpers and are edited in place,
    # so the iterator is materialised and each tuple becomes a list.
    symbols = [list(entry) for entry in smiles_gen]

    graph = MolecularGraph(symbols)
    _build_molecular_graph(graph, symbols, {})

    # nothing to kekulize when no aromatic atom was registered
    if graph.aro_indices:
        _kekulize(graph)

    # hand the (possibly rewritten) symbols back as tuples
    yield from map(tuple, graph.smiles_symbols)
def _build_molecular_graph(graph,
                           smiles_symbols: List[List[Union[str, int]]],
                           rings: Dict[int, Tuple[int, int]],
                           prev_idx: int = -1,
                           curr_idx: int = -1) -> int:
    """Fills a molecular graph from the parsed symbols of a SMILES.

    The symbols are scanned left to right and a bond is added to ``graph``
    for every atom-atom connection and every closed ring. A ``'('`` branch
    symbol is handled by a recursive call that scans the branch; any other
    branch symbol ends the current call. Note that ``smiles_symbols`` is
    mutated in this function, for convenience.

    :param graph: the MolecularGraph to be added to.
    :param smiles_symbols: a list created from the iterator returned
        by ``encoder._parse_smiles``.
    :param rings: an, initially, empty dictionary used to keep track of
        ring bonds that have been opened but not yet closed. It maps the
        ring symbol to the index of its atom and the index of the ring
        symbol itself.
    :param prev_idx: the index of the most recently seen atom, to which the
        next atom or ring bond attaches; ``-1`` if there is none yet.
    :param curr_idx: the index of the last processed symbol; scanning
        resumes at ``curr_idx + 1``.
    :return: the last index of ``smiles_symbols`` that was processed.
    """

    n_symbols = len(smiles_symbols)

    while curr_idx + 1 < n_symbols:
        curr_idx += 1
        _, symbol, symbol_type = smiles_symbols[curr_idx]

        # --- atom: bond it to the previous atom, if there is one
        if symbol_type == ATOM_TYPE:
            if prev_idx >= 0:
                graph.add_bond(prev_idx, curr_idx, curr_idx)
            prev_idx = curr_idx
            continue

        # --- branch: descend on '(', otherwise the current branch is over
        if symbol_type == BRANCH_TYPE:
            if symbol != '(':
                break
            curr_idx = _build_molecular_graph(graph, smiles_symbols, rings,
                                              prev_idx, curr_idx)
            continue

        # --- ring bond, first occurrence: remember where it was opened
        if symbol not in rings:
            rings[symbol] = (prev_idx, curr_idx)
            continue

        # --- ring bond, second occurrence: close the ring
        left_idx, left_bond_idx = rings.pop(symbol)

        # we mutate one bond index to be '', so that we
        # can faithfully represent the bond to be localized at
        # one index. For example, C=1CCCC=1 --> C1CCCC=1.
        if smiles_symbols[left_bond_idx][0] != '':
            bond_idx = left_bond_idx
            smiles_symbols[curr_idx][0] = ''
        else:
            bond_idx = curr_idx
            smiles_symbols[left_bond_idx][0] = ''

        graph.add_bond(left_idx, prev_idx, bond_idx)

    return curr_idx
def _kekulize(mol_graph) -> None:
    """Kekulizes a molecular graph in place.

    The graph is first reduced to its pi subgraph. Double bonds are then
    assigned by a depth-first search started from every remaining node, and
    the result is written back into the underlying SMILES symbols.

    :param mol_graph: a molecular graph to be kekulized.
    :return: None.
    :raises ValueError: if no valid double bond assignment exists.
    """

    mol_graph.prune_to_pi_subgraph()

    # ``visited`` is shared by all searches; the matching sets start empty
    # for every search
    visited = set()
    for node in mol_graph.get_nodes_by_num_edges():
        if mol_graph.dfs_assign_bonds(node, visited, set(), set()):
            continue
        raise ValueError("kekulization algorithm failed")

    mol_graph.write_to_smiles_symbols()
# Aromatic Helper Methods and Classes
# key = aromatic SMILES element, value = number of valence electrons
# Note: wild card '*' not supported currently
_aromatic_valences = {
'b': 3, 'al': 3, 'c': 4, 'si': 4, 'n': 5, 'p': 5,
'as': 5, 'o': 6, 's': 6, 'se': 6, 'te': 6
}
def _capitalize(atom_symbol: str) -> str:
    """Turns an aromatic SMILES atom symbol into a standard one by
    upper-casing the first character of its element portion.

    :param atom_symbol: an aromatic SMILES atom symbol.
    :return: the capitalized ``atom_symbol``.
    """

    start, _ = find_element(atom_symbol)
    head = atom_symbol[:start]
    tail = atom_symbol[start + 1:]
    return head + atom_symbol[start].upper() + tail
def _is_aromatic(atom_symbol: str) -> bool:
    """Tells whether a SMILES atom symbol is written in aromatic form.

    An aromatic SMILES atom symbol is one whose element substring is not
    capitalized.

    :param atom_symbol: a SMILES atom symbol.
    :return: True, if ``atom_symbol`` is an aromatic atom symbol,
        and False otherwise.
    :raises ValueError: if the element is not capitalized but is not a
        known aromatic element either.
    """

    start, end = find_element(atom_symbol)

    # when the element runs to the end of the symbol, the symbol is used
    # as-is, which avoids copying the string
    if end == len(atom_symbol):
        element = atom_symbol
    else:
        element = atom_symbol[start: end]

    if element[0].isupper():  # capitalized, hence not aromatic
        return False

    if element in _aromatic_valences:
        return True

    raise ValueError("unrecognized aromatic symbol '{}'"
                     .format(atom_symbol))
def _in_pi_subgraph(atom_symbol: str, bonds: Tuple[str]) -> bool:
    """Decides whether an atom belongs in the pi subgraph, given its bonds.

    An atom is a node of the pi subgraph if it has an unpaired valence
    electron and is therefore able to make a double bond.

    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
    -treatment-of-aromaticity-in-the-smiles-language/

    :param atom_symbol: a SMILES atom symbol representing an atom.
    :param bonds: the bonds connected to ``atom_symbol``.
    :return: True if ``atom_symbol`` should be included in the pi subgraph,
        and False otherwise.
    :raises ValueError: if the atom carries more than one hydrogen.
    """

    atom, h_count, charge = parse_atom_symbol(atom_symbol)

    # electrons already committed to the given bonds
    used_electrons = sum(get_num_from_bond(b) for b in bonds)

    # e.g. c1ccccc1
    # this also covers the neutral carbon radical case (e.g. C1=[C]NC=C1),
    # which is treated equivalently to a 1-H carbon (e.g. C1=[CH]NC=C1)
    plain_carbon = (atom == 'c') and (h_count == 0) and (charge == 0)
    if plain_carbon and (len(bonds) == 2) and ('#' not in bonds):
        h_count += 1  # implied bonded hydrogen

    if h_count > 1:
        raise ValueError("unrecognized aromatic symbol '{}'"
                         .format(atom_symbol))
    if h_count == 1:  # e.g. [nH]
        used_electrons += 1

    # an odd number of left-over electrons means one of them is unpaired
    valence = _aromatic_valences[atom] - charge
    return (valence - used_electrons) % 2 != 0
class MolecularGraph:
    """A molecular graph layered on top of a ``smiles_symbols`` list.

    Nodes and edges are not stored as separate objects with their own
    labels. Instead, an index into ``smiles_symbols`` stands for a node or
    an edge, depending on whether the atom or the bond slot of that entry
    is meant.

    :ivar smiles_symbols: the list created from the iterator returned by
        ``encoder._parse_smiles``. Serves as the base data structure
        of this class, as everything is communicated through indices
        referring to elements of this list.
    :ivar graph: maps the index of an atom in ``smiles_symbols`` to the
        list of Bond objects attached to that atom. Represents the actual
        molecular graph.
    :ivar aro_indices: a set of indices of the atoms in ``smiles_symbols``
        that are aromatic in the molecular graph.
    """

    def __init__(self, smiles_symbols: List[List[Union[str, int]]]):
        self.smiles_symbols = smiles_symbols
        self.graph = {}
        self.aro_indices = set()

    def get_atom_symbol(self, idx: int) -> str:
        """Returns the SMILES atom symbol stored at an index.

        :param idx: an index in ``smiles_symbols``.
        :return: the SMILES symbol representing an atom at index
            ``idx`` in ``smiles_symbols``.
        """

        entry = self.smiles_symbols[idx]
        return entry[1]

    def get_bond_symbol(self, idx: int) -> str:
        """Returns the SMILES bond symbol stored at an index.

        :param idx: an index in ``smiles_symbols``.
        :return: the SMILES symbol representing a bond at index
            ``idx`` in ``smiles_symbols``.
        """

        entry = self.smiles_symbols[idx]
        return entry[0]

    def get_nodes_by_num_edges(self) -> List[int]:
        """Returns all nodes (or indices) of this molecular graph, roughly
        ordered by their number of edges.

        ``dfs_assign_bonds`` runs faster when started from nodes with few
        edges, because fewer bond configurations have to be explored. As a
        compromise between speed and a full sort, nodes with at most one
        edge are simply placed ahead of all other nodes.

        :return: a list of the nodes (or indices) of this molecular graph,
            semi-sorted by number of edges.
        """

        ends, middles = [], []
        for idx, edges in self.graph.items():
            bucket = middles if len(edges) > 1 else ends
            bucket.append(idx)
        return ends + middles

    def set_atom_symbol(self, atom_symbol: str, idx: int) -> None:
        """Overwrites the SMILES atom symbol stored at an index.

        :param atom_symbol: the new value of the atom symbol at ``idx``.
        :param idx: an index in ``smiles_symbols``.
        :return: None.
        """

        entry = self.smiles_symbols[idx]
        entry[1] = atom_symbol

    def set_bond_symbol(self, bond_symbol: str, idx: int) -> None:
        """Overwrites the SMILES bond symbol stored at an index.

        :param bond_symbol: the new value of the bond symbol at ``idx``.
        :param idx: an index in ``smiles_symbols``.
        :return: None.
        """

        entry = self.smiles_symbols[idx]
        entry[0] = bond_symbol

    def add_bond(self, idx_a: int, idx_b: int, bond_idx: int) -> None:
        """Adds a bond (or edge) between the atoms (or nodes) at two
        indices, and records which of the two atoms are aromatic.

        :param idx_a: the index of one atom (or node) of this bond.
        :param idx_b: the index of the other atom (or node) of this bond.
        :param bond_idx: the index of this bond.
        :return: None.
        """

        endpoints = (idx_a, idx_b)

        # Both symbols are read and both aromaticity checks are done
        # before anything is modified.
        atoms = [self.get_atom_symbol(i) for i in endpoints]
        aromatic = [(i in self.aro_indices) or _is_aromatic(atom)
                    for i, atom in zip(endpoints, atoms)]

        bond_symbol = self.get_bond_symbol(bond_idx)
        aromatic_bond = (bond_symbol == ':')

        # an explicit ':' bond marks both of its atoms as aromatic
        for i, atom_is_aromatic in zip(endpoints, aromatic):
            if atom_is_aromatic or aromatic_bond:
                self.aro_indices.add(i)

        if aromatic_bond:
            # Note: ':' bonds are edited here to ''
            self.set_bond_symbol('', bond_idx)
            bond_symbol = ''

        # the same Bond object is shared by both of its end nodes
        edge = Bond(idx_a, idx_b, bond_symbol, bond_idx)
        for i in endpoints:
            self.graph.setdefault(i, []).append(edge)

    def prune_to_pi_subgraph(self) -> None:
        """Shrinks this molecular graph down to its pi subgraph.

        Afterwards, the graph only contains the aromatic atoms (or nodes)
        that belong in the pi subgraph, together with the unlabelled
        (``''``) bonds that run between two such atoms.

        :return: None.
        """

        # drop every node that is not aromatic
        for idx in self.graph.keys() - self.aro_indices:
            del self.graph[idx]

        # drop aromatic nodes that cannot take part in a double bond
        for idx in self.aro_indices:
            atom = self.get_atom_symbol(idx)
            bonds = tuple(edge.bond_symbol for edge in self.graph[idx])
            if not _in_pi_subgraph(atom, bonds):
                self.graph.pop(idx)

        # drop edges that lead out of the subgraph or already carry a
        # bond symbol
        for idx, edges in self.graph.items():
            self.graph[idx] = [
                edge for edge in edges
                if (edge.idx_a in self.graph)
                and (edge.idx_b in self.graph)
                and (edge.bond_symbol == '')
            ]

    def dfs_assign_bonds(self, idx: int,
                         visited: Set[int],
                         matched_nodes: Set[int],
                         matched_edges) -> bool:
        """Assigns double bonds between pairs of nodes such that every
        node is paired or matched. Meant to be called after
        ``prune_to_pi_subgraph``.

        The assignment is searched for recursively, in a depth-first
        fashion, and undone again whenever a choice leads to a dead end.

        :param idx: the index of the current atom (or node).
        :param visited: a set of the indices of nodes that have been visited.
        :param matched_nodes: a set of the indices of nodes that have been
            matched, i.e., assigned a double bond.
        :param matched_edges: a set of the bonds that have been matched.
        :return: True, if a valid bond assignment was found; False otherwise.
        """

        if idx in visited:
            return True

        edges = self.graph[idx]

        if idx in matched_nodes:
            # This node already has its double bond, so the search moves
            # on to its neighbours. If one of them cannot be matched,
            # ``visited`` is rolled back (in place) and failure reported.
            visited_before = visited.copy()
            visited.add(idx)

            for edge in edges:
                neighbour = edge.other_end(idx)
                if not self.dfs_assign_bonds(neighbour, visited,
                                             matched_nodes,
                                             matched_edges):
                    visited &= visited_before
                    return False
            return True

        # This node still needs a partner: collect the edges that lead to
        # an unmatched neighbour and could become a double bond.
        candidates = [edge for edge in edges
                      if edge.other_end(idx) not in matched_nodes]

        if not candidates:
            return False  # idx is unmatched, but all adj nodes are matched

        matched_edges_before = matched_edges.copy()

        for candidate in candidates:
            # tentatively match the two nodes joined by this candidate
            matched_nodes.add(candidate.idx_a)
            matched_nodes.add(candidate.idx_b)
            matched_edges.add(candidate)

            if self.dfs_assign_bonds(idx, visited,
                                     matched_nodes,
                                     matched_edges):
                candidate.bond_symbol = '='
                return True

            # the matching failed, so everything matched since the
            # snapshot is undone before the next candidate is tried
            for undone in matched_edges - matched_edges_before:
                undone.bond_symbol = ''
                matched_nodes.discard(undone.idx_a)
                matched_nodes.discard(undone.idx_b)
            matched_edges &= matched_edges_before

        return False

    def write_to_smiles_symbols(self):
        """Copies the information held in ``self.graph`` back into
        ``self.smiles_symbols``, mutating the latter.

        This is called once the molecular graph has been kekulized, to
        merge the new information back into the original data structure.

        :return: None.
        """

        # aromatic atoms are written in their capitalized form
        for idx in self.aro_indices:
            capitalized = _capitalize(self.get_atom_symbol(idx))
            self.set_atom_symbol(capitalized, idx)

        # write bonds
        for edge_list in self.graph.values():
            for edge in edge_list:
                symbol, where = edge.bond_symbol, edge.bond_idx
                self.set_bond_symbol(symbol, where)

                # branches record the next symbol as their bond, so we
                # must update accordingly
                follows_branch = (where > 0) and \
                    (self.smiles_symbols[where - 1][2] == BRANCH_TYPE)
                if follows_branch:
                    self.set_bond_symbol(symbol, where - 1)
class Bond:
    """A bond, i.e. an edge of a MolecularGraph.

    All indices below refer to positions in the ``smiles_symbols`` list of
    the MolecularGraph. Two bonds compare (and hash) equal when they join
    the same ordered pair of atom indices.

    :ivar idx_a: the index of one atom or node of this bond.
    :ivar idx_b: the index of the other atom or node of this bond.
    :ivar bond_symbol: the SMILES symbol representing this bond (e.g. '#').
    :ivar bond_idx: the index of this bond or edge.
    """

    def __init__(self, idx_a, idx_b, bond_symbol, bond_idx):
        self.idx_a, self.idx_b = idx_a, idx_b
        self.bond_symbol, self.bond_idx = bond_symbol, bond_idx

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return NotImplemented
        return (self.idx_a, self.idx_b) == (other.idx_a, other.idx_b)

    def __hash__(self):
        # must agree with __eq__, so only the end points are hashed
        return hash((self.idx_a, self.idx_b))

    def other_end(self, idx):
        """Returns the index at the opposite end of this bond.

        :param idx: an index of one atom or node of this bond.
        :return: the index of the other atom or node of this bond, or
            None if ``idx`` is an invalid input.
        """

        if idx == self.idx_a:
            return self.idx_b
        if idx == self.idx_b:
            return self.idx_a
        return None
from typing import Dict, Iterable, List, Set, Tuple, Union
def len_selfies(selfies: str) -> int:
    """Returns the number of symbols in a SELFIES.

    This is the symbol length of the SELFIES, which is generally different
    from the length of the string itself (i.e. ``len(selfies)``). Every
    ``'['`` opens one symbol and every ``'.'`` is a symbol of its own.

    :param selfies: a SELFIES.
    :return: the symbol length of ``selfies``.

    :Example:

    >>> import selfies
    >>> selfies.len_selfies('[C][O][C]')
    3
    >>> selfies.len_selfies('[C][=C][F].[C]')
    5
    """

    return sum(map(selfies.count, ("[", ".")))
def split_selfies(selfies: str) -> Iterable[str]:
    """Splits a SELFIES into its symbols.

    Returns an iterable that yields the symbols of a SELFIES one-by-one
    in the order they appear in the string. SELFIES symbols are always
    either indicated by an open and closed square bracket, or are the ``'.'``
    dot-bond symbol.

    :param selfies: the SELFIES to be read.
    :return: an iterable of the symbols of ``selfies`` in the same order
        they appear in the string.
    :raises ValueError: if a ``'['`` is never closed by a ``']'``. The
        error is raised lazily, when the iterable reaches that bracket.

    :Example:

    >>> import selfies
    >>> list(selfies.split_selfies('[C][O][C]'))
    ['[C]', '[O]', '[C]']
    >>> list(selfies.split_selfies('[C][=C][F].[C]'))
    ['[C]', '[=C]', '[F]', '.', '[C]']
    """

    left_idx = selfies.find("[")

    while 0 <= left_idx < len(selfies):
        right_idx = selfies.find("]", left_idx + 1)

        # ``find`` returns -1 when there is no closing bracket. Carrying on
        # would reset ``left_idx`` to 0 and restart the scan from the
        # beginning of the string, so the loop would never terminate.
        if right_idx == -1:
            raise ValueError("malformed SELFIES string, hanging '[' bracket")

        yield selfies[left_idx: right_idx + 1]

        left_idx = right_idx + 1

        # a dot-bond directly after a symbol is a symbol of its own
        if selfies[left_idx: left_idx + 1] == ".":
            yield "."
            left_idx += 1
def get_alphabet_from_selfies(selfies_iter: Iterable[str]) -> Set[str]:
    """Builds an alphabet from an iterable of SELFIES.

    The returned alphabet is the smallest set of SELFIES symbols from which
    every SELFIES in the iterable can be constructed. Note that the symbol
    ``'.'`` is never a member of the returned set, even if it appears in
    the input.

    :param selfies_iter: an iterable of SELFIES.
    :return: the SELFIES alphabet built from the SELFIES in ``selfies_iter``.

    :Example:

    >>> import selfies
    >>> selfies_list = ['[C][F][O]', '[C].[O]', '[F][F]']
    >>> alphabet = selfies.get_alphabet_from_selfies(selfies_list)
    >>> sorted(list(alphabet))
    ['[C]', '[F]', '[O]']
    """

    alphabet = set()
    for selfies in selfies_iter:
        alphabet.update(split_selfies(selfies))

    # the dot-bond is not counted as part of the alphabet
    alphabet.discard(".")
    return alphabet
def selfies_to_encoding(
        selfies: str,
        vocab_stoi: Dict[str, int],
        pad_to_len: int = -1,
        enc_type: str = 'both'
) -> Union[List[int], List[List[int]], Tuple[List[int], List[List[int]]]]:
    """Encodes a SELFIES as labels (integers) and/or as one-hot vectors.

    A label encoded output is a list of size ``(N,)`` and a one-hot
    encoded output is a list of size ``(N, len(vocab_stoi))``, where ``N``
    is the symbol length of the (potentially padded) SELFIES. Note that
    SELFIES uses the special padding symbol ``[nop]``.

    :param selfies: the SELFIES to be encoded.
    :param vocab_stoi: a dictionary that maps SELFIES symbols (the keys)
        to a non-negative index. The indices of the dictionary
        must be contiguous, starting from 0.
    :param pad_to_len: the length the SELFIES is to be padded to.
        If ``pad_to_len`` is less than or equal to the symbol
        length of the SELFIES, then no padding is added. Defaults to ``-1``.
    :param enc_type: the type of encoding of the output:
        ``label`` or ``one_hot`` or ``both``.
        If the value is ``both``, then a tuple of the label and one-hot
        encoding are returned (in that order). Defaults to ``both``.
    :return: the label encoded and/or one-hot encoded SELFIES.
    :raises ValueError: if ``enc_type`` is not one of the accepted values.

    :Example:

    >>> import selfies as sf
    >>> sf.selfies_to_encoding('[C][F]', {'[C]': 0, '[F]': 1})
    ([0, 1], [[1, 0], [0, 1]])
    """

    if enc_type not in ('label', 'one_hot', 'both'):
        raise ValueError("enc_type must be in ('label', 'one_hot', 'both')")

    # top up with [nop] symbols until the requested length is reached
    missing = pad_to_len - len_selfies(selfies)
    if missing > 0:
        selfies += "[nop]" * missing

    # label encoding: one vocabulary index per symbol
    labels = [vocab_stoi[symbol] for symbol in split_selfies(selfies)]
    if enc_type == 'label':
        return labels

    # one-hot encoding: one row per symbol, with a single 1 at its label
    width = len(vocab_stoi)
    one_hot_rows = []
    for label in labels:
        row = [0] * width
        row[label] = 1
        one_hot_rows.append(row)

    if enc_type == 'one_hot':
        return one_hot_rows
    return labels, one_hot_rows
def encoding_to_selfies(
        encoded: Union[List[int], List[List[int]]],
        vocab_itos: Dict[int, str],
        enc_type: str,
) -> str:
    """Decodes a label (integer) or one-hot encoded list back into
    a SELFIES string.

    A label encoded input is expected to be a list of size ``(N,)``, and a
    one-hot encoded input is expected to be a 2D list of size
    ``(N, len(vocab_itos))``.

    :param encoded: a label or one-hot encoded list.
    :param vocab_itos: a dictionary that maps non-negative indices (the keys)
        to SELFIES symbols. The indices of the dictionary
        must be contiguous, starting from 0.
    :param enc_type: the type of encoding of the input:
        ``label`` or ``one_hot``.
    :return: the SELFIES string represented by the encoded input.
    :raises ValueError: if ``enc_type`` is not one of the accepted values.

    :Example:

    >>> import selfies as sf
    >>> one_hot = [[0, 1, 0], [0, 0, 1], [1, 0, 0]]
    >>> vocab_itos = {0: '[nop]', 1: '[C]', 2: '[F]'}
    >>> sf.encoding_to_selfies(one_hot, vocab_itos, enc_type='one_hot')
    '[C][F][nop]'
    """

    if enc_type not in ('label', 'one_hot'):
        raise ValueError("enc_type must be in ('label', 'one_hot')")

    if enc_type == 'one_hot':
        # the label of a one-hot row is the position of its first 1
        labels = [row.index(1) for row in encoded]
    else:
        labels = encoded

    # labels -> symbols -> SELFIES
    return "".join([vocab_itos[label] for label in labels])
def batch_selfies_to_flat_hot(
        selfies_batch: List[str],
        vocab_stoi: Dict[str, int],
        pad_to_len: int = -1,
) -> List[List[int]]:
    """Encodes a list of SELFIES as a list of flattened one-hot encodings.

    The result is a list of size ``(batch_size, N * len(vocab_stoi))``,
    where ``N`` is the symbol length of the (potentially padded) SELFIES.
    Note that SELFIES uses the special padding symbol ``[nop]``.

    :param selfies_batch: a list of SELFIES to be converted.
    :param vocab_stoi: a dictionary that maps SELFIES symbols (the keys)
        to a non-negative index. The indices of the dictionary
        must be contiguous, starting from 0.
    :param pad_to_len: the length that each SELFIES is to be padded to.
        If ``pad_to_len`` is less than or equal to the symbol
        length of the SELFIES, then no padding is added. Defaults to ``-1``.
    :return: the flattened one-hot encoded representations of the SELFIES
        from the batch. This is a 2D list of size
        ``(batch_size, N * len(vocab_stoi))``.

    :Example:

    >>> import selfies as sf
    >>> batch = ["[C]", "[C][C]"]
    >>> vocab_stoi = {'[nop]': 0, '[C]': 1}
    >>> sf.batch_selfies_to_flat_hot(batch, vocab_stoi, 2)
    [[0, 1, 1, 0], [0, 1, 0, 1]]
    """

    def flat_one_hot(selfies):
        # one-hot encode, then join the rows end to end
        rows = selfies_to_encoding(selfies, vocab_stoi, pad_to_len,
                                   enc_type='one_hot')
        flat = []
        for row in rows:
            flat.extend(row)
        return flat

    return [flat_one_hot(selfies) for selfies in selfies_batch]
def batch_flat_hot_to_selfies(
        one_hot_batch: List[List[int]],
        vocab_itos: Dict[int, str],
) -> List[str]:
    """Convert a batch of flattened one-hot encodings into
    a list of SELFIES.

    We expect ``one_hot_batch`` to be a list of size ``(batch_size, S)``,
    where ``S`` is divisible by the length of the vocabulary.

    :param one_hot_batch: a list of flattened one-hot encoded representations.
    :param vocab_itos: a dictionary that maps non-negative indices (the keys)
        to SELFIES symbols. We expect the indices of the dictionary
        to be contiguous and starting from 0.
    :return: a list of SELFIES strings.
    :raises ValueError: if a vector of the batch has a size that is not
        divisible by the length of the vocabulary, or if the vocabulary
        is empty while the batch is not.

    :Example:

    >>> import selfies as sf
    >>> batch = [[0, 1, 1, 0], [0, 1, 0, 1]]
    >>> vocab_itos = {0: '[nop]', 1: '[C]'}
    >>> sf.batch_flat_hot_to_selfies(batch, vocab_itos)
    ['[C][nop]', '[C][C]']
    """

    # the vocabulary size is the same for every vector of the batch
    vocab_size = len(vocab_itos)

    selfies_list = []
    for flat_one_hot in one_hot_batch:

        # An empty vocabulary would otherwise surface as a
        # ZeroDivisionError in the divisibility check below.
        if vocab_size == 0:
            raise ValueError("vocab_itos must contain at least one symbol.")

        if len(flat_one_hot) % vocab_size != 0:
            raise ValueError("size of vector in one_hot_batch not divisible "
                             "by the length of the vocabulary.")

        # Cut the flat vector into rows of ``vocab_size`` entries: each
        # row is one position in the SELFIES, each column one vocabulary
        # entry.
        one_hot = [flat_one_hot[start: start + vocab_size]
                   for start in range(0, len(flat_one_hot), vocab_size)]

        selfies_list.append(
            encoding_to_selfies(one_hot, vocab_itos, enc_type='one_hot')
        )

    return selfies_list