selfies
Advanced tools
| import functools | ||
| from itertools import product | ||
| from typing import Dict, Set, Union | ||
| from selfies.constants import ELEMENTS, INDEX_ALPHABET | ||
# Default bonding capacities. Keys have the form "E", "E+C" or "E-C"
# (element symbol plus optional charge); the special "?" key is the
# fallback capacity used for every atom that is not listed explicitly.
_DEFAULT_CONSTRAINTS = {
    # hydrogen and the halogens
    "H": 1, "F": 1, "Cl": 1, "Br": 1, "I": 1,
    # second-row elements and their +1 / -1 ions
    "B": 3, "B+1": 2, "B-1": 4,
    "O": 2, "O+1": 3, "O-1": 1,
    "N": 3, "N+1": 4, "N-1": 2,
    "C": 4, "C+1": 5, "C-1": 3,
    # third-row elements that may expand their valence shell
    "P": 5, "P+1": 6, "P-1": 4,
    "S": 6, "S+1": 7, "S-1": 5,
    # fallback
    "?": 8
}

# Every preset is an independent copy of the defaults with its own overrides
# layered on top, so that editing one preset never leaks into another.
_PRESET_CONSTRAINTS = {
    "default": {**_DEFAULT_CONSTRAINTS},
    "octet_rule": {
        **_DEFAULT_CONSTRAINTS,
        "S": 2, "S+1": 3, "S-1": 1, "P": 3, "P+1": 4, "P-1": 2,
    },
    "hypervalent": {
        **_DEFAULT_CONSTRAINTS,
        "Cl": 7, "Br": 7, "I": 7, "N": 5,
    },
}

# The constraints currently in force; starts out as the "default" preset.
_current_constraints = _PRESET_CONSTRAINTS["default"]
def get_preset_constraints(name: str) -> Dict[str, int]:
    """Looks up one of the preset semantic constraints by name.

    Three presets are available: the ``default`` constraints, constraints
    that enforce the `octet rule
    <https://en.wikipedia.org/wiki/Octet_rule>`_, and constraints that
    accommodate `hypervalent molecules
    <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_.
    They differ only in the following entries:

    .. table::
        :align: center
        :widths: auto

        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
        |                 | Cl, Br, I | N | P | P+1 | P-1 | S | S+1 | S-1 |
        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
        | ``default``     |     1     | 3 | 5 |  6  |  4  | 6 |  7  |  5  |
        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
        | ``octet_rule``  |     1     | 3 | 3 |  4  |  2  | 2 |  3  |  1  |
        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
        | ``hypervalent`` |     7     | 5 | 5 |  6  |  4  | 6 |  7  |  5  |
        +-----------------+-----------+---+---+-----+-----+---+-----+-----+

    :param name: the preset name: ``default`` or ``octet_rule`` or
        ``hypervalent``.
    :return: a fresh copy of the requested preset, as a dictionary mapping
        atoms (the keys) to their bonding capacities (the values).
    :raises ValueError: if ``name`` is not the name of a preset.
    """

    preset = _PRESET_CONSTRAINTS.get(name)
    if preset is None:
        raise ValueError("unrecognized preset name '{}'".format(name))
    # hand out a copy so that callers cannot alter the stored preset
    return preset.copy()
def get_semantic_constraints() -> Dict[str, int]:
    """Reports the semantic constraints :mod:`selfies` currently uses.

    :return: a copy of the current semantic constraints, as a dictionary
        mapping atoms (the keys) to their bonding capacities (the values).
    """

    # a copy is returned, so mutating the result leaves the module untouched
    return _current_constraints.copy()
def set_semantic_constraints(
        bond_constraints: Union[str, Dict[str, int]] = "default"
) -> None:
    """Replaces the semantic constraints that :mod:`selfies` operates on.

    A string argument is interpreted as the name of a preset
    (see :func:`selfies.get_preset_constraints`).

    A dictionary argument is taken as the new constraints themselves. It
    maps atoms (the keys) to non-negative bonding capacities (the values).
    Atoms are written as ``E``, ``E+C`` or ``E-C``, where ``E`` is an
    element symbol and ``C`` is a positive integer, for instance:

        * ``bond_constraints["I-1"] = 0``
        * ``bond_constraints["C"] = 4``

    The dictionary must also carry the special ``?`` key, whose value is the
    bonding capacity of every atom that is not listed explicitly.

    :param bond_constraints: the name of a preset, or a dictionary
        representing the new semantic constraints.
    :return: ``None``.
    :raises ValueError: if the preset name is unknown, if the argument is
        neither a string nor a dictionary, or if the dictionary lacks the
        ``?`` key or holds an invalid key or value.
    """

    global _current_constraints

    if isinstance(bond_constraints, str):
        updated = get_preset_constraints(bond_constraints)

    elif isinstance(bond_constraints, dict):
        if "?" not in bond_constraints:
            raise ValueError("bond_constraints missing '?' as a key")

        for atom, capacity in bond_constraints.items():
            # position of the charge sign, or -1 for a neutral atom
            sign_at = max(atom.find("+"), atom.find("-"))

            if atom == "?":
                key_ok = True
            elif sign_at == -1:
                key_ok = atom in ELEMENTS
            else:
                element, magnitude = atom[:sign_at], atom[sign_at + 1:]
                key_ok = (element in ELEMENTS) and magnitude.isnumeric()

            if not key_ok:
                raise ValueError(
                    "invalid key '{}' in bond_constraints".format(atom)
                )

            if (not isinstance(capacity, int)) or (capacity < 0):
                raise ValueError(
                    "invalid value at bond_constraints['{}'] = {}"
                    .format(atom, capacity)
                )

        # store a copy so later edits by the caller have no effect
        updated = dict(bond_constraints)

    else:
        raise ValueError("bond_constraints must be a str or dict")

    _current_constraints = updated

    # both cached functions read the constraints, so their results are stale
    get_semantic_robust_alphabet.cache_clear()
    get_bonding_capacity.cache_clear()
@functools.lru_cache()
def get_semantic_robust_alphabet() -> Set[str]:
    """Builds the subset of SELFIES symbols that :mod:`selfies` constrains
    under the current semantic constraints.

    :return: a subset of all SELFIES symbols that are semantically constrained.
    """

    bond_orders = (("", 1), ("=", 2), ("#", 3))

    # atomic symbols: one per (atom, bond) pair the atom has capacity for;
    # the "?" fallback entry is not an atom and is left out
    robust = {
        "[{}{}]".format(prefix, atom)
        for atom, capacity in _current_constraints.items()
        for prefix, order in bond_orders
        if (atom != "?") and (order <= capacity)
    }

    # branch and ring symbols
    structural = (
        "[Ring{}]", "[=Ring{}]",
        "[Branch{}]", "[=Branch{}]", "[#Branch{}]"
    )
    for size in (1, 2, 3):
        for template in structural:
            robust.add(template.format(size))

    robust.update(INDEX_ALPHABET)
    return robust
@functools.lru_cache()
def get_bonding_capacity(element: str, charge: int) -> int:
    """Looks up the bonding capacity of an atom under the current
    semantic constraints.

    :param element: the element of the input atom.
    :param charge: the charge of the input atom.
    :return: the bonding capacity of the input atom.
    """

    # neutral atoms are keyed by element alone, ions as e.g. "N+1" / "O-1"
    lookup = element if (charge == 0) else element + "{:+}".format(charge)

    # atoms without an explicit entry fall back to the "?" capacity
    if lookup not in _current_constraints:
        lookup = "?"
    return _current_constraints[lookup]
| from selfies.utils.smiles_utils import atom_to_smiles, smiles_to_atom | ||
def modernize_symbol(symbol):
    """Translates a pre-v2 SELFIES symbol into its current spelling.

    :param symbol: an old SELFIES symbol.
    :return: the latest equivalent of the input symbol, or the input symbol
        itself, if no such equivalent exists.
    """

    # renamed branch / ring symbols are resolved by a direct table lookup
    try:
        return _SYMBOL_UPDATE_TABLE[symbol]
    except KeyError:
        pass

    if not symbol.endswith("expl]"):
        return symbol

    # symbol looks like [XXXexpl], possibly with a leading bond character
    if symbol[1] in "=#/\\":
        bond_char = symbol[1]
        atom_symbol = symbol[2:-5]
    else:
        bond_char = ""
        atom_symbol = symbol[1:-5]

    atom = smiles_to_atom("[{}]".format(atom_symbol))
    if (atom is None) or atom.is_aromatic:
        return symbol  # left untouched

    # re-serialize the atom to obtain its standardized spelling
    standardized = atom_to_smiles(atom, brackets=False)
    return "[{}{}]".format(bond_char, standardized)
def _build_update_table():
    """Builds the mapping from old (<v2) branch and ring symbols to their
    current spelling, for the index sizes 1, 2 and 3.

    :return: a dictionary mapping each old symbol to its replacement.
    """

    # (old template, new template); "{}" is filled with the index size
    templates = (
        ("[Branch{}_1]", "[Branch{}]"),
        ("[Branch{}_2]", "[=Branch{}]"),
        ("[Branch{}_3]", "[#Branch{}]"),
        ("[Expl=Ring{}]", "[=Ring{}]"),
        ("[Expl#Ring{}]", "[#Ring{}]"),
        ("[Expl/Ring{}]", "[//Ring{}]"),
        ("[Expl\\Ring{}]", "[\\\\Ring{}]"),
    )
    return {
        old.format(size): new.format(size)
        for size in range(1, 4)
        for old, new in templates
    }


# computed once at import time and consulted by modernize_symbol
_SYMBOL_UPDATE_TABLE = _build_update_table()
# Symbols of the chemical elements recognized by this module.
ELEMENTS = {
    # periods 1 - 3
    "H", "He",
    "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
    # period 4
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr",
    # period 5
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "In", "Sn", "Sb", "Te", "I", "Xe",
    # period 6 (lanthanides listed separately below)
    "Cs", "Ba", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
    "Tl", "Pb", "Bi", "Po", "At", "Rn",
    # period 7 (actinides listed separately below)
    "Fr", "Ra", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn",
    "Fl", "Lv",
    # lanthanides
    "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er",
    "Tm", "Yb", "Lu",
    # actinides
    "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
    "Md", "No", "Lr"
}

ORGANIC_SUBSET = {"B", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"}

# Candidate valences of each element that may be written as aromatic.
AROMATIC_VALENCES = {
    "B": (3,), "Al": (3,),
    "C": (4,), "Si": (4,),
    "N": (3, 5), "P": (3, 5), "As": (3, 5),
    "O": (2, 4), "S": (2, 4), "Se": (2, 4), "Te": (2, 4)
}

# Lowercase spellings of the elements listed in AROMATIC_VALENCES.
AROMATIC_SUBSET = {element.lower() for element in AROMATIC_VALENCES}

# =============================================================================
# SELFIES-specific constants
# =============================================================================

# The position of a symbol in this tuple is its index code (see INDEX_CODE).
INDEX_ALPHABET = (
    "[C]", "[Ring1]", "[Ring2]",
    "[Branch1]", "[=Branch1]", "[#Branch1]",
    "[Branch2]", "[=Branch2]", "[#Branch2]",
    "[O]", "[N]", "[=N]", "[=C]", "[#C]", "[S]", "[P]"
)

INDEX_CODE = {symbol: code for code, symbol in enumerate(INDEX_ALPHABET)}
class SMILESParserError(ValueError):
    """Exception raised when a SMILES fails to be parsed.

    :param smiles: the SMILES string that failed to be parsed.
    :param reason: a short description of the failure.
    :param idx: the index into ``smiles`` at which the failure occurred.
    """

    def __init__(self, smiles, reason="N/A", idx=-1):
        # Forward the arguments to the base class so that ``args`` is
        # populated; without it the exception cannot be pickled or copied,
        # because it would be rebuilt by calling the class with no arguments.
        super().__init__(smiles, reason, idx)
        self.smiles = smiles
        self.idx = idx
        self.reason = reason

    def __str__(self):
        # The caret line is padded by the width of "SMILES: " so that the
        # caret lands directly underneath the character at ``idx``.
        err_msg = "\n" \
                  "\tSMILES: {smiles}\n" \
                  "\t        {pointer}\n" \
                  "\tIndex: {index}\n" \
                  "\tReason: {reason}"

        return err_msg.format(
            smiles=self.smiles,
            pointer=(" " * self.idx + "^"),
            index=self.idx,
            reason=self.reason
        )
class EncoderError(Exception):
    """Signals a failure inside :func:`selfies.encoder`."""
class DecoderError(Exception):
    """Signals a failure inside :func:`selfies.decoder`."""
| import functools | ||
| import itertools | ||
| from typing import List, Optional, Union | ||
| from selfies.bond_constraints import get_bonding_capacity | ||
| from selfies.constants import AROMATIC_VALENCES | ||
| from selfies.utils.matching_utils import find_perfect_matching | ||
class Atom:
    """An atom with associated specifications (e.g. charge, chirality).

    :param element: the element symbol of the atom.
    :param is_aromatic: whether the atom is flagged as aromatic.
    :param isotope: the isotope number, or ``None`` if unspecified.
    :param chirality: the chirality tag, or ``None`` if unspecified.
    :param h_count: the number of explicit hydrogens, or ``None`` if
        unspecified.
    :param charge: the formal charge of the atom.
    """

    def __init__(
            self,
            element: str,
            is_aromatic: bool,
            isotope: Optional[int] = None,
            chirality: Optional[str] = None,
            h_count: Optional[int] = None,
            charge: int = 0
    ):
        self.index = None  # assigned when the atom is added to a graph
        self.element = element
        self.is_aromatic = is_aromatic
        self.isotope = isotope
        self.chirality = chirality
        self.h_count = h_count
        self.charge = charge

    @property
    def bonding_capacity(self):
        """The number of bonds this atom may still form: the capacity of
        its element/charge under the current semantic constraints, minus
        its explicit hydrogens (if any).
        """

        # Deliberately NOT memoized per instance: a cache keyed on ``self``
        # would keep returning the first value even after ``h_count`` or
        # ``charge`` is changed or the semantic constraints are replaced,
        # and it would keep every Atom alive for the cache's lifetime.
        # The underlying lookup, get_bonding_capacity, is already cached.
        bond_cap = get_bonding_capacity(self.element, self.charge)
        if self.h_count is not None:
            bond_cap -= self.h_count
        return bond_cap

    def invert_chirality(self) -> None:
        """Swaps the chirality tag between ``@`` and ``@@``; any other
        value (including ``None``) is left unchanged.
        """

        if self.chirality == "@":
            self.chirality = "@@"
        elif self.chirality == "@@":
            self.chirality = "@"
class DirectedBond:
    """A bond that contains directional information.

    :param src: the index of the atom the bond starts from.
    :param dst: the index of the atom the bond points to.
    :param order: the bond order.
    :param stereo: the stereo marker of the bond, or ``None``.
    :param ring_bond: whether the bond is a ring bond.
    """

    def __init__(
            self,
            src: int,
            dst: int,
            order: Union[int, float],
            stereo: Optional[str],
            ring_bond: bool
    ):
        self.src, self.dst = src, dst
        self.order, self.stereo = order, stereo
        self.ring_bond = ring_bond
| class MolecularGraph: | ||
| """A molecular graph. | ||
| Molecules can be viewed as weighted undirected graphs. However, SMILES | ||
| and SELFIES strings are more naturally represented as weighted directed | ||
| graphs, where the direction of the edges specifies the order of atoms | ||
| and bonds in the string. | ||
| """ | ||
| def __init__(self): | ||
| self._roots = list() # stores root atoms, where traversal begins | ||
| self._atoms = list() # stores atoms in this graph | ||
| self._bond_dict = dict() # stores all bonds in this graph | ||
| self._adj_list = list() # adjacency list, representing this graph | ||
| self._bond_counts = list() # stores number of bonds an atom has made | ||
| self._ring_bond_flags = list() # stores if an atom makes a ring bond | ||
| self._delocal_subgraph = dict() # delocalization subgraph | ||
| def __len__(self): | ||
| return len(self._atoms) | ||
| def has_bond(self, a: int, b: int) -> bool: | ||
| if a > b: | ||
| a, b = b, a | ||
| return (a, b) in self._bond_dict | ||
| def has_out_ring_bond(self, src: int) -> bool: | ||
| return self._ring_bond_flags[src] | ||
| def get_roots(self) -> List[int]: | ||
| return self._roots | ||
| def get_atom(self, idx: int) -> Atom: | ||
| return self._atoms[idx] | ||
| def get_atoms(self) -> List[Atom]: | ||
| return self._atoms | ||
| def get_dirbond(self, src, dst) -> DirectedBond: | ||
| return self._bond_dict[(src, dst)] | ||
| def get_out_dirbonds(self, src: int) -> List[DirectedBond]: | ||
| return self._adj_list[src] | ||
| def get_bond_count(self, idx: int) -> int: | ||
| return self._bond_counts[idx] | ||
| def add_atom(self, atom: Atom, mark_root: bool = False) -> None: | ||
| atom.index = len(self) | ||
| if mark_root: | ||
| self._roots.append(atom.index) | ||
| self._atoms.append(atom) | ||
| self._adj_list.append(list()) | ||
| self._bond_counts.append(0) | ||
| self._ring_bond_flags.append(False) | ||
| if atom.is_aromatic: | ||
| self._delocal_subgraph[atom.index] = list() | ||
| def add_bond( | ||
| self, src: int, dst: int, | ||
| order: Union[int, float], stereo: str | ||
| ) -> None: | ||
| assert src < dst | ||
| bond = DirectedBond(src, dst, order, stereo, False) | ||
| self._add_bond_at_loc(bond, -1) | ||
| self._bond_counts[src] += order | ||
| self._bond_counts[dst] += order | ||
| if order == 1.5: | ||
| self._delocal_subgraph.setdefault(src, []).append(dst) | ||
| self._delocal_subgraph.setdefault(dst, []).append(src) | ||
| def add_placeholder_bond(self, src: int) -> int: | ||
| out_edges = self._adj_list[src] | ||
| out_edges.append(None) | ||
| return len(out_edges) - 1 | ||
| def add_ring_bond( | ||
| self, a: int, b: int, | ||
| order: Union[int, float], | ||
| a_stereo: Optional[str], b_stereo: Optional[str], | ||
| a_pos: int = -1, b_pos: int = -1 | ||
| ) -> None: | ||
| a_bond = DirectedBond(a, b, order, a_stereo, True) | ||
| b_bond = DirectedBond(b, a, order, b_stereo, True) | ||
| self._add_bond_at_loc(a_bond, a_pos) | ||
| self._add_bond_at_loc(b_bond, b_pos) | ||
| self._bond_counts[a] += order | ||
| self._bond_counts[b] += order | ||
| self._ring_bond_flags[a] = True | ||
| self._ring_bond_flags[b] = True | ||
| if order == 1.5: | ||
| self._delocal_subgraph.setdefault(a, []).append(b) | ||
| self._delocal_subgraph.setdefault(b, []).append(a) | ||
| def update_bond_order( | ||
| self, a: int, b: int, | ||
| new_order: Union[int, float] | ||
| ) -> None: | ||
| assert 1 <= new_order <= 3 | ||
| if a > b: | ||
| a, b = b, a # swap so that a < b | ||
| a_to_b = self._bond_dict[(a, b)] # prev step guarantees existence | ||
| if new_order == a_to_b.order: | ||
| return | ||
| elif a_to_b.ring_bond: | ||
| b_to_a = self._bond_dict[(b, a)] | ||
| bonds = (a_to_b, b_to_a) | ||
| else: | ||
| bonds = (a_to_b,) | ||
| old_order = bonds[0].order | ||
| for bond in bonds: | ||
| bond.order = new_order | ||
| self._bond_counts[a] += (new_order - old_order) | ||
| self._bond_counts[b] += (new_order - old_order) | ||
| def _add_bond_at_loc(self, bond, pos): | ||
| self._bond_dict[(bond.src, bond.dst)] = bond | ||
| out_edges = self._adj_list[bond.src] | ||
| if (pos == -1) or (pos == len(out_edges)): | ||
| out_edges.append(bond) | ||
| elif out_edges[pos] is None: | ||
| out_edges[pos] = bond | ||
| else: | ||
| out_edges.insert(pos, bond) | ||
| def is_kekulized(self) -> bool: | ||
| return not self._delocal_subgraph | ||
| def kekulize(self) -> bool: | ||
| # Algorithm based on Depth-First article by Richard L. Apodaca | ||
| # Reference: | ||
| # https://depth-first.com/articles/2020/02/10/ | ||
| # a-comprehensive-treatment-of-aromaticity-in-the-smiles-language/ | ||
| if self.is_kekulized(): | ||
| return True | ||
| ds = self._delocal_subgraph | ||
| kept_nodes = set(itertools.filterfalse(self._prune_from_ds, ds)) | ||
| # relabel kept DS nodes to be 0, 1, 2, ... | ||
| label_to_node = list(sorted(kept_nodes)) | ||
| node_to_label = {v: i for i, v in enumerate(label_to_node)} | ||
| # pruned and relabelled DS | ||
| pruned_ds = [list() for _ in range(len(kept_nodes))] | ||
| for node in kept_nodes: | ||
| label = node_to_label[node] | ||
| for adj in filter(lambda v: v in kept_nodes, ds[node]): | ||
| pruned_ds[label].append(node_to_label[adj]) | ||
| matching = find_perfect_matching(pruned_ds) | ||
| if matching is None: | ||
| return False | ||
| # de-aromatize and then make double bonds | ||
| for node in ds: | ||
| for adj in ds[node]: | ||
| self.update_bond_order(node, adj, new_order=1) | ||
| self._atoms[node].is_aromatic = False | ||
| self._bond_counts[node] = int(self._bond_counts[node]) | ||
| for matched_labels in enumerate(matching): | ||
| matched_nodes = tuple(label_to_node[i] for i in matched_labels) | ||
| self.update_bond_order(*matched_nodes, new_order=2) | ||
| self._delocal_subgraph = dict() # clear DS | ||
| return True | ||
| def _prune_from_ds(self, node): | ||
| adj_nodes = self._delocal_subgraph[node] | ||
| if not adj_nodes: | ||
| return True # aromatic atom with no aromatic bonds | ||
| atom = self._atoms[node] | ||
| valences = AROMATIC_VALENCES[atom.element] | ||
| # each bond in DS has order 1.5 - we treat them as single bonds | ||
| used_electrons = int(self._bond_counts[node] - 0.5 * len(adj_nodes)) | ||
| if atom.h_count is None: # account for implicit Hs | ||
| assert atom.charge == 0 | ||
| return any(used_electrons == v for v in valences) | ||
| else: | ||
| valence = valences[-1] - atom.charge | ||
| used_electrons += atom.h_count | ||
| free_electrons = valence - used_electrons | ||
| return not ((free_electrons >= 0) and (free_electrons % 2 != 0)) |
| from typing import Dict, List, Tuple, Union | ||
| from selfies.utils.selfies_utils import len_selfies, split_selfies | ||
def selfies_to_encoding(
        selfies: str,
        vocab_stoi: Dict[str, int],
        pad_to_len: int = -1,
        enc_type: str = 'both'
) -> Union[List[int], List[List[int]], Tuple[List[int], List[List[int]]]]:
    """Encodes a SELFIES string as labels (integers) and/or one-hot vectors.

    The label encoding is a list of shape ``(L,)`` and the one-hot encoding
    is a 2D list of shape ``(L, len(vocab_stoi))``, where ``L`` is the
    symbol length of the SELFIES string. The SELFIES string may optionally
    be padded before it is encoded.

    :param selfies: the SELFIES string to be encoded.
    :param vocab_stoi: a dictionary that maps SELFIES symbols to indices,
        which must be non-negative and contiguous, starting from 0.
        If the SELFIES string is to be padded, then the special padding symbol
        ``[nop]`` must also be a key in this dictionary.
    :param pad_to_len: the length that the SELFIES string is padded to.
        If this value is less than or equal to the symbol length of the
        SELFIES string, then no padding is added. Defaults to ``-1``.
    :param enc_type: the type of encoding of the output:
        ``label`` or ``one_hot`` or ``both``.
        If this value is ``both``, then a tuple of the label and one-hot
        encodings is returned. Defaults to ``both``.
    :return: the label encoded and/or one-hot encoded SELFIES string.
    :raises ValueError: if ``enc_type`` is not one of the accepted values.

    :Example:

    >>> import selfies as sf
    >>> sf.selfies_to_encoding("[C][F]", {"[C]": 0, "[F]": 1})
    ([0, 1], [[1, 0], [0, 1]])
    """

    if enc_type not in ("label", "one_hot", "both"):
        raise ValueError("enc_type must be in ('label', 'one_hot', 'both')")

    # append [nop] symbols until the requested length is reached
    shortfall = pad_to_len - len_selfies(selfies)
    if shortfall > 0:
        selfies += "[nop]" * shortfall

    labels = [vocab_stoi[symbol] for symbol in split_selfies(selfies)]
    if enc_type == "label":
        return labels

    # one row per symbol, with a single 1 at the symbol's index
    width = len(vocab_stoi)
    one_hot = []
    for label in labels:
        row = [0] * width
        row[label] = 1
        one_hot.append(row)

    if enc_type == "one_hot":
        return one_hot
    return labels, one_hot
def encoding_to_selfies(
        encoding: Union[List[int], List[List[int]]],
        vocab_itos: Dict[int, str],
        enc_type: str,
) -> str:
    """Decodes a label (integer) or one-hot encoding into a SELFIES string.

    A label encoded input is expected to be a list of shape ``(L,)``;
    a one-hot encoded input is expected to be a 2D list of
    shape ``(L, len(vocab_itos))``.

    :param encoding: a label or one-hot encoding.
    :param vocab_itos: a dictionary that maps indices to SELFIES symbols.
        The indices of this dictionary must be non-negative and contiguous,
        starting from 0.
    :param enc_type: the type of encoding of the input:
        ``label`` or ``one_hot``.
    :return: the SELFIES string represented by the input encoding.
    :raises ValueError: if ``enc_type`` is not one of the accepted values.

    :Example:

    >>> import selfies as sf
    >>> one_hot = [[0, 1, 0], [0, 0, 1], [1, 0, 0]]
    >>> vocab_itos = {0: "[nop]", 1: "[C]", 2: "[F]"}
    >>> sf.encoding_to_selfies(one_hot, vocab_itos, enc_type="one_hot")
    '[C][F][nop]'
    """

    if enc_type not in ("label", "one_hot"):
        raise ValueError("enc_type must be in ('label', 'one_hot')")

    if enc_type == "one_hot":
        # the label of a row is the position of its (first) 1
        labels = [row.index(1) for row in encoding]
    else:
        labels = encoding

    return "".join(vocab_itos[label] for label in labels)
def batch_selfies_to_flat_hot(
        selfies_batch: List[str],
        vocab_stoi: Dict[str, int],
        pad_to_len: int = -1,
) -> List[List[int]]:
    """Encodes a list of SELFIES strings as flattened one-hot encodings.

    Every SELFIES string of the input list is one-hot encoded with
    :func:`selfies.selfies_to_encoding` (to which ``vocab_stoi`` and
    ``pad_to_len`` are forwarded), and the resulting rows are then
    concatenated into a single flat list.

    :param selfies_batch: the list of SELFIES strings to be encoded.
    :param vocab_stoi: a dictionary that maps SELFIES symbols to indices.
    :param pad_to_len: the length that each SELFIES string in the input list
        is padded to. Defaults to ``-1``.
    :return: the flattened one-hot encodings of the input list.

    :Example:

    >>> import selfies as sf
    >>> batch = ["[C]", "[C][C]"]
    >>> vocab_stoi = {"[nop]": 0, "[C]": 1}
    >>> sf.batch_selfies_to_flat_hot(batch, vocab_stoi, 2)
    [[0, 1, 1, 0], [0, 1, 0, 1]]
    """

    flat_batch = []
    for selfies in selfies_batch:
        rows = selfies_to_encoding(
            selfies, vocab_stoi, pad_to_len, enc_type="one_hot"
        )
        # concatenate the rows, in order, into one flat vector
        flat = []
        for row in rows:
            flat.extend(row)
        flat_batch.append(flat)
    return flat_batch
def batch_flat_hot_to_selfies(
        one_hot_batch: List[List[int]],
        vocab_itos: Dict[int, str],
) -> List[str]:
    """Decodes a list of flattened one-hot encodings into SELFIES strings.

    Every encoding of the input list is cut back into rows of length
    ``len(vocab_itos)`` and then decoded with
    :func:`selfies.encoding_to_selfies`, to which ``vocab_itos`` is
    forwarded.

    :param one_hot_batch: a list of flattened one-hot encodings. Each
        encoding must be a list of length divisible by ``len(vocab_itos)``.
    :param vocab_itos: a dictionary that maps indices to SELFIES symbols.
    :return: the list of SELFIES strings represented by the input encodings.
    :raises ValueError: if the length of an encoding is not divisible by
        the size of the vocabulary.

    :Example:

    >>> import selfies as sf
    >>> batch = [[0, 1, 1, 0], [0, 1, 0, 1]]
    >>> vocab_itos = {0: "[nop]", 1: "[C]"}
    >>> sf.batch_flat_hot_to_selfies(batch, vocab_itos)
    ['[C][nop]', '[C][C]']
    """

    decoded = []
    for flat in one_hot_batch:
        width = len(vocab_itos)  # number of columns, one per vocab entry
        if len(flat) % width != 0:
            raise ValueError("size of vector in one_hot_batch not divisible "
                             "by the length of the vocabulary.")
        n_rows = len(flat) // width

        # one row of `width` entries per position of the SELFIES string
        rows = [
            flat[start: start + width]
            for start in range(0, n_rows * width, width)
        ]
        decoded.append(
            encoding_to_selfies(rows, vocab_itos, enc_type="one_hot")
        )
    return decoded
| from typing import Any | ||
class SinglyLinkedList:
    """A simple singly linked list that supports O(1) append and O(1) extend.

    Each node is a two-element list ``[item, next_node]``.
    """

    def __init__(self):
        self._head = None
        self._tail = None
        self._count = 0

    def __len__(self):
        return self._count

    def __iter__(self):
        return SinglyLinkedListIterator(self)

    @property
    def head(self):
        return self._head

    def append(self, item: Any) -> None:
        """Adds an item to the end of the list."""
        node = [item, None]
        if self._head is None:
            self._head = node
        else:
            self._tail[1] = node
        self._tail = node
        self._count += 1

    def extend(self, other) -> None:
        """Links the nodes of another list onto the end of this one.

        The nodes are shared with ``other`` rather than copied.
        """
        assert isinstance(other, SinglyLinkedList)
        if other._head is None:
            return  # nothing to attach

        if self._head is None:
            self._head = other._head
        else:
            self._tail[1] = other._head
        self._tail = other._tail
        self._count += len(other)
class SinglyLinkedListIterator:
    """Iterates over the items of a linked list, starting from its head."""

    def __init__(self, linked_list):
        self._curr = linked_list.head

    def __iter__(self):
        return self

    def __next__(self):
        node = self._curr
        if node is None:
            raise StopIteration
        # a node is [item, next_node]; step forward, then hand out the item
        self._curr = node[1]
        return node[0]
| import heapq | ||
| import itertools | ||
| from collections import deque | ||
| from typing import List, Optional | ||
def find_perfect_matching(graph: List[List[int]]) -> Optional[List[int]]:
    """Searches for a perfect matching of an undirected graph (without
    self-loops).

    :param graph: an adjacency list representing the input graph.
    :return: a list representing a perfect matching, where j is the i-th
        element if nodes i and j are matched. Returns None, if the graph cannot
        be perfectly matched.
    """

    # a greedy matching is used as the starting point, for efficiency
    matching = _greedy_matching(graph)
    unmatched = {node for node, mate in enumerate(matching) if mate is None}

    while unmatched:
        # try to grow the matching along an augmenting path from this node
        root = unmatched.pop()
        path = _find_augmenting_path(graph, root, matching)
        if path is None:
            return None

        _flip_augmenting_path(matching, path)
        # both end points of the path are matched now
        unmatched.difference_update((path[0], path[-1]))

    return matching
def _greedy_matching(graph):
    """Greedily builds a matching, handling nodes with the fewest unmatched
    neighbours first.

    :param graph: an adjacency list representing the input graph.
    :return: a list in which the i-th element is the node matched with
        node i, or None if node i was left unmatched.
    """

    matching = [None] * len(graph)

    # free_degrees[i] = number of unmatched neighbours of node i
    free_degrees = [len(neighbours) for neighbours in graph]

    # min-heap of (free degree, node); entries may be outdated by the time
    # they are popped, which is checked for below
    heap = [(degree, node) for node, degree in enumerate(free_degrees)]
    heapq.heapify(heap)

    while heap:
        _, node = heapq.heappop(heap)

        if (matching[node] is not None) or (free_degrees[node] == 0):
            continue  # node cannot be matched

        # pair the node with its first unmatched neighbour
        mate = next(adj for adj in graph[node] if matching[adj] is None)
        matching[node], matching[mate] = mate, node

        # every neighbour of the new pair has one unmatched neighbour fewer
        for adj in itertools.chain(graph[node], graph[mate]):
            free_degrees[adj] -= 1
            if (matching[adj] is None) and (free_degrees[adj] > 0):
                heapq.heappush(heap, (free_degrees[adj], adj))

    return matching
def _find_augmenting_path(graph, root, matching):
    """Runs a modified BFS from an unmatched node to another unmatched node.

    :param graph: an adjacency list representing the input graph.
    :param root: the unmatched node the search starts from.
    :param matching: the current matching.
    :return: the nodes of the path found, listed from the far end back to
        ``root``, or None if the search finds no such path.
    """

    assert matching[root] is None

    # BFS tree; None marks an unvisited node. An entry [prev, via] says
    # that the node was reached from ``prev`` by way of its neighbour ``via``.
    parents = [None] * len(graph)
    parents[root] = [None, None]

    frontier = deque([root])
    far_end = None

    while frontier and (far_end is None):
        node = frontier.popleft()

        for adj in graph[node]:
            mate = matching[adj]

            if mate is not None:
                # step over the matched edge (adj, mate), at most once
                if parents[mate] is None:
                    parents[mate] = [node, adj]
                    frontier.append(mate)
            elif adj != root:
                # an unmatched node other than root: path found
                parents[adj] = [node, adj]
                far_end = adj
                break

    if far_end is None:
        return None

    # walk the BFS tree from the far end back to the root
    path = []
    cursor = far_end
    while cursor != root:
        prev, via = parents[cursor]
        path.extend((via, prev))
        cursor = prev
    return path
def _flip_augmenting_path(matching, path):
    """Matches the nodes of a path pairwise (1st with 2nd, 3rd with 4th,
    and so on), overwriting their previous entries in ``matching``.

    :param matching: the matching to be updated in place.
    :param path: a list of nodes of even length.
    """

    for start in range(0, len(path), 2):
        left = path[start]
        right = path[start + 1]
        matching[left], matching[right] = right, left
| from typing import Iterable, Iterator, Set | ||
def len_selfies(selfies: str) -> int:
    """Counts the symbols of a given SELFIES string.

    :param selfies: a SELFIES string.
    :return: the symbol length of the SELFIES string.

    :Example:

    >>> import selfies as sf
    >>> sf.len_selfies("[C][=C][F].[C]")
    5
    """

    # every bracketed symbol opens with "[", and each "." is a symbol too
    return sum(selfies.count(marker) for marker in ("[", "."))
def split_selfies(selfies: str) -> Iterator[str]:
    """Tokenizes a SELFIES string into its individual symbols.

    :param selfies: a SELFIES string.
    :return: the symbols of the SELFIES string one-by-one with order preserved.
    :raises ValueError: if a symbol has no closing bracket (raised once
        iteration reaches that symbol).

    :Example:

    >>> import selfies as sf
    >>> list(sf.split_selfies("[C][=C][F].[C]"))
    ['[C]', '[=C]', '[F]', '.', '[C]']
    """

    end = len(selfies)
    start = selfies.find("[")  # -1 if there is no symbol at all

    while 0 <= start < end:
        close = selfies.find("]", start + 1)
        if close == -1:
            raise ValueError("malformed SELFIES string, hanging '[' bracket")

        yield selfies[start: close + 1]
        start = close + 1

        # a dot directly after a symbol is emitted as its own symbol
        if selfies.startswith(".", start):
            yield "."
            start += 1
def get_alphabet_from_selfies(selfies_iter: Iterable[str]) -> Set[str]:
    """Collects the distinct symbols used by a collection of SELFIES strings.

    Each string is tokenized with :func:`split_selfies`; the union of all
    symbols is returned, with the dot ``.`` symbol left out.

    :param selfies_iter: an iterable of SELFIES strings.
    :return: the set of SELFIES symbols found in the input iterable.

    :Example:

    >>> import selfies as sf
    >>> selfies_list = ["[C][F][O]", "[C].[O]", "[F][F]"]
    >>> alphabet = sf.get_alphabet_from_selfies(selfies_list)
    >>> sorted(list(alphabet))
    ['[C]', '[F]', '[O]']
    """
    symbols = set()
    for selfies in selfies_iter:
        symbols.update(split_selfies(selfies))
    symbols.discard(".")  # the dot is not considered part of the alphabet
    return symbols
| import enum | ||
| import re | ||
| from collections import deque | ||
| from typing import Iterator, Optional, Tuple, Union | ||
| from selfies.constants import AROMATIC_SUBSET, ELEMENTS, ORGANIC_SUBSET | ||
| from selfies.exceptions import SMILESParserError | ||
| from selfies.mol_graph import Atom, DirectedBond, MolecularGraph | ||
# Anchored pattern for a complete bracketed SMILES atom such as [13C@@H2+1:7].
# It has six capture groups, in order: isotope, element, chirality, H count,
# charge and atom class. Every optional group matches the empty string when
# the corresponding part is absent.
SMILES_BRACKETED_ATOM_PATTERN = re.compile(
    r"^[\[]"  # opening square bracket [
    r"(\d*)"  # isotope number (optional, e.g. 123, 26)
    r"([A-Za-z][a-z]?)"  # element symbol
    r"([@]{0,2})"  # chiral_tag (optional, only @ and @@ supported)
    r"((?:[H]\d?)?)"  # H count (optional, e.g. H, H0, H3)
    r"((?:[+]+|[-]+|[+-]\d+)?)"  # charge (optional, e.g. ---, +1, ++)
    r"((?:[:]\d+)?)"  # atom class (optional, e.g. :12, :1)
    r"[]]$"  # closing square bracket ]
)

# Bond order for every SMILES bond character; the stereo bonds / and \ count
# as single bonds and the aromatic bond : as order 1.5.
SMILES_BOND_ORDERS = {"-": 1, "/": 1, "\\": 1, ":": 1.5, "=": 2, "#": 3}

# The subset of bond characters that carry a stereochemical specification.
SMILES_STEREO_BONDS = {"/", "\\"}
class SMILESTokenTypes(enum.Enum):
    """The kinds of tokens that :func:`tokenize_smiles` produces."""

    ATOM = 0  # an atom symbol, bare or bracketed (e.g. C, Br, [NH])
    BRANCH = 1  # a branch bracket, i.e. ( or )
    RING = 2  # a ring number: one digit, or % followed by two digits
    DOT = 3  # the dot . symbol
class SMILESToken:
    """One token of a SMILES string: a symbol (atom, branch bracket, ring
    number or dot) plus, if present, the bond character right before it
    (e.g. =C, %12, #N).

    A token does not copy any text. It only stores positions into the
    SMILES string it was cut from, so that string must be passed back in
    to read the token's contents.
    """

    def __init__(
            self,
            bond_idx: Optional[int],
            start_idx: int, end_idx: int, token_type: SMILESTokenTypes
    ):
        # index of the preceding bond character, or None if there is none
        self.bond_idx = bond_idx
        # the symbol itself occupies smiles[start_idx:end_idx]
        self.start_idx, self.end_idx = start_idx, end_idx
        self.token_type = token_type

    def extract_bond_char(self, smiles):
        """Returns this token's bond character in ``smiles``, or ``None``
        if the token has no preceding bond.
        """
        if self.bond_idx is None:
            return None
        return smiles[self.bond_idx]

    def extract_symbol(self, smiles):
        """Returns the text of this token's symbol in ``smiles``."""
        first, stop = self.start_idx, self.end_idx
        return smiles[first:stop]
def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]:
    """Splits a SMILES string into its tokens.

    A token is an atom, branch bracket, ring number or dot, together with
    the bond character that directly precedes it (if any). Dots and branch
    brackets never carry a bond.

    :param smiles: the input SMILES string.
    :return: the tokens of the input SMILES one-by-one with order preserved.
    :raises SMILESParserError: (while iterating) if a bond character is not
        followed by an atom or ring number, if a ``[`` is never closed, if
        a ``%`` is not followed by exactly two digits, or if an
        unrecognized character is encountered.
    """

    i = 0
    while i < len(smiles):

        if smiles[i] == ".":
            yield SMILESToken(None, i, i + 1, SMILESTokenTypes.DOT)
            i += 1
            continue

        # an optional bond character may precede the symbol
        if smiles[i] in SMILES_BOND_ORDERS:
            bond_idx = i
            i += 1
        else:
            bond_idx = None

        if i == len(smiles):
            # only reachable when the string ends with a bond character
            raise SMILESParserError(smiles, "hanging bond", i - 1)

        elif smiles[i].isalpha():  # organic subset elements
            if smiles[i: i + 2] in ("Br", "Cl"):  # two-letter elements
                token = SMILESToken(bond_idx, i, i + 2, SMILESTokenTypes.ATOM)
            else:  # one-letter elements (e.g. C, N, ...)
                token = SMILESToken(bond_idx, i, i + 1, SMILESTokenTypes.ATOM)

        elif smiles[i] == "[":  # atoms encased in brackets (e.g. [NH])
            r_idx = smiles.find("]", i + 1)
            if r_idx == -1:
                raise SMILESParserError(smiles, "hanging bracket [", i)
            token = SMILESToken(bond_idx, i, r_idx + 1, SMILESTokenTypes.ATOM)

        elif smiles[i] in ("(", ")"):  # open and closed branch brackets
            if bond_idx is not None:
                # same condition and wording as the end-of-string case above
                raise SMILESParserError(smiles, "hanging bond", bond_idx)
            token = SMILESToken(None, i, i + 1, SMILESTokenTypes.BRANCH)

        elif smiles[i].isdigit():  # one-digit ring number
            token = SMILESToken(bond_idx, i, i + 1, SMILESTokenTypes.RING)

        elif smiles[i] == "%":  # two-digit ring number (e.g. %12)
            rnum = smiles[i + 1: i + 3]
            if not (rnum.isnumeric() and len(rnum) == 2):
                err_msg = "invalid ring number '%{}'".format(rnum)
                raise SMILESParserError(smiles, err_msg, i)
            token = SMILESToken(bond_idx, i, i + 3, SMILESTokenTypes.RING)

        else:
            err_msg = "unrecognized symbol '{}'".format(smiles[i])
            raise SMILESParserError(smiles, err_msg, i)

        yield token
        i = token.end_idx
| # ============================================================================= | ||
| # SMILES -> Atom, Graph, etc. | ||
| # ============================================================================= | ||
def smiles_to_atom(atom_symbol: str) -> Optional[Atom]:
    """Reads an atom from its SMILES representation.

    Unbracketed symbols must belong to the organic subset (e.g. ``C``) or
    the aromatic subset (e.g. ``c``). Bracketed symbols (e.g. ``[C@@H]``,
    ``[O-]``) are parsed with :data:`SMILES_BRACKETED_ATOM_PATTERN`; their
    atom class, if any, is discarded.

    :param atom_symbol: a SMILES atom symbol.
    :return: the atom that the input symbol represents, or ``None`` if the
        symbol is empty or not a recognized atom.
    """

    if not atom_symbol:
        # an empty symbol is invalid like any other unrecognized symbol;
        # without this guard the bracket check below raises IndexError
        return None

    if not (atom_symbol[0] == "[" and atom_symbol[-1] == "]"):
        if atom_symbol in ORGANIC_SUBSET:  # e.g. C, N, O, ...
            return Atom(atom_symbol, False)
        if atom_symbol in AROMATIC_SUBSET:  # e.g. c, n, o, ...
            return Atom(atom_symbol.capitalize(), True)
        return None

    # e.g. [C], [C@@H], [O-], ...
    m = SMILES_BRACKETED_ATOM_PATTERN.match(atom_symbol)
    if m is None:
        return None
    isotope, element, chirality, h_count, charge, _ = m.groups()

    isotope = None if (isotope == "") else int(isotope)

    # aromaticity must be decided before the element is capitalized
    is_aromatic = element.islower() and (element in AROMATIC_SUBSET)
    element = element.capitalize()
    if element not in ELEMENTS:
        return None

    chirality = None if (chirality == "") else chirality

    s = h_count
    if s == "":
        h_count = 0
    else:
        s = s[1:]  # HXXX -> XXX
        h_count = 1 if (s == "") else int(s)  # a bare H means one hydrogen

    s = charge
    if s == "":
        charge = 0
    else:
        if s[-1].isdigit():  # (+/-)XXX
            charge = int(s[1:])
        else:  # +++... or ---....
            charge = len(s)
        charge *= 1 if s[0] == "+" else -1

    return Atom(
        element=element,
        is_aromatic=is_aromatic,
        isotope=isotope,
        chirality=chirality,
        h_count=h_count,
        charge=charge
    )
def smiles_to_bond(
        bond_char: Optional[str]
) -> Tuple[Union[int, float], Optional[str]]:
    """Interprets a SMILES bond character.

    :param bond_char: a SMILES bond symbol, or ``None`` for an implicit bond.
    :return: a pair ``(order, stereo)``. The order is looked up in
        ``SMILES_BOND_ORDERS`` and defaults to 1 for anything not listed
        there (including ``None``); ``stereo`` is the bond character itself
        if it is a stereo bond (``/`` or ``\\``) and ``None`` otherwise.
    """

    if bond_char in SMILES_STEREO_BONDS:
        stereo = bond_char
    else:
        stereo = None
    return SMILES_BOND_ORDERS.get(bond_char, 1), stereo
def smiles_to_mol(smiles: str) -> MolecularGraph:
    """Builds a molecular graph out of a SMILES string.

    The string is tokenized in full, then the tokens are consumed in
    repeated passes of :func:`_derive_mol_from_tokens` until none remain.

    :param smiles: the input SMILES string.
    :return: a molecular graph that the input SMILES string represents.
    :raises SMILESParserError: if the input SMILES is invalid.
    """

    if smiles == "":
        raise SMILESParserError(smiles, "empty SMILES", 0)

    graph = MolecularGraph()
    pending = deque(tokenize_smiles(smiles))
    while len(pending) > 0:
        # each pass takes tokens off the front of the shared queue
        _derive_mol_from_tokens(graph, smiles, pending)
    return graph
def _derive_mol_from_tokens(mol, smiles, tokens):
    """Consumes tokens from the front of ``tokens`` and adds the atoms and
    bonds they describe to ``mol``, stopping after a dot token or when the
    tokens run out.

    :param mol: the molecular graph being built (modified in place).
    :param smiles: the SMILES string the tokens were cut from; used to read
        token text and to build error messages.
    :param tokens: a deque of SMILES tokens (modified in place).
    :raises SMILESParserError: on an invalid atom symbol, a chain or branch
        that does not begin with an atom, an unmatched bracket, a ring
        number that is never closed, or an empty graph.
    """
    tok = None
    prev_stack = deque()  # keep track of previous atom on the current chain
    branch_stack = deque()  # keep track of open branches
    ring_log = dict()  # keep track of hanging ring numbers
    chain_start = True

    # tok is still None here: a None "previous atom" makes _attach_atom
    # mark the first atom as a root
    prev_stack.append(tok)

    while tokens:
        tok = tokens.popleft()
        bond_char = tok.extract_bond_char(smiles)
        symbol, symbol_type = tok.extract_symbol(smiles), tok.token_type
        prev_atom = prev_stack[-1]

        if symbol_type == SMILESTokenTypes.DOT:
            break  # the dot is consumed; later tokens are left for the caller

        elif symbol_type == SMILESTokenTypes.ATOM:
            curr = smiles_to_atom(symbol)
            if curr is None:
                err_msg = "invalid atom symbol '{}'".format(symbol)
                raise SMILESParserError(smiles, err_msg, tok.start_idx)

            curr = _attach_atom(mol, bond_char, curr, prev_atom)
            # the new atom becomes the previous atom of the current chain
            prev_stack.pop()
            prev_stack.append(curr)
            chain_start = False

        elif chain_start:
            # any non-atom token at the start of a chain or branch is invalid
            err_msg = "SMILES chain begins with non-atom"
            raise SMILESParserError(smiles, err_msg, tok.start_idx)

        elif symbol_type == SMILESTokenTypes.BRANCH:
            if symbol == "(":
                branch_stack.append(tok)
                # the branch hangs off the current atom, so duplicate it;
                # the copy is dropped again at the matching ")"
                prev_stack.append(prev_atom)
                chain_start = True
            else:
                if not branch_stack:
                    err_msg = "hanging ')' bracket"
                    raise SMILESParserError(smiles, err_msg, tok.start_idx)
                branch_stack.pop()
                prev_stack.pop()

        elif symbol_type == SMILESTokenTypes.RING:
            if symbol not in ring_log:
                # first occurrence: record it until the ring is closed
                lpos = mol.add_placeholder_bond(src=prev_atom.index)
                ring_log[symbol] = (tok, prev_atom, lpos)
            else:
                # second occurrence: close the ring and free the number
                ltoken, latom, lpos = ring_log.pop(symbol)
                _make_ring_bonds(
                    mol=mol, smiles=smiles,
                    ltoken=ltoken, latom=latom, lpos=lpos,
                    rtoken=tok, ratom=prev_atom
                )

        else:
            # should not happen
            raise Exception("invalid symbol type")

    # NOTE(review): this checks the size of the whole graph, not whether
    # this particular call added anything -- presumably len(mol) counts
    # atoms; confirm against MolecularGraph.
    if len(mol) == 0:
        err_idx = (len(smiles) if (tok is None) else tok.start_idx) - 1
        raise SMILESParserError(smiles, "empty SMILES fragment", err_idx)

    if branch_stack:
        # report the most recently opened "(" that was never closed
        err_idx = branch_stack[-1].start_idx
        raise SMILESParserError(smiles, "hanging '(' bracket", err_idx)

    if ring_log:
        # report the most recently opened ring number that was never closed
        rnum, (tok, _, _) = list(ring_log.items())[-1]
        err_msg = "hanging ring number '{}'".format(rnum)
        raise SMILESParserError(smiles, err_msg, tok.start_idx)
def _attach_atom(mol, bond_char, atom, prev_atom):
    """Adds ``atom`` to ``mol`` and, unless it starts a new fragment, bonds
    it to ``prev_atom``.

    :param mol: the molecular graph to extend (modified in place).
    :param bond_char: the SMILES bond character written before the atom,
        or ``None`` if the bond is implicit.
    :param atom: the atom to add.
    :param prev_atom: the atom to bond to, or ``None`` if ``atom`` is the
        first atom of a fragment (it is then marked as a root).
    :return: the atom that was added.
    """
    if prev_atom is None:
        mol.add_atom(atom, mark_root=True)
        return atom

    mol.add_atom(atom, mark_root=False)
    bond_order, bond_stereo = smiles_to_bond(bond_char)
    if prev_atom.is_aromatic and atom.is_aromatic and (bond_char is None):
        bond_order = 1.5  # handle implicit aromatic bonds, e.g. cc
    mol.add_bond(
        src=prev_atom.index, dst=atom.index,
        order=bond_order, stereo=bond_stereo
    )
    return atom
def _make_ring_bonds(mol, smiles, ltoken, latom, lpos, rtoken, ratom):
    """Closes a ring by bonding the atoms at its two ring-number tokens.

    :param mol: the molecular graph to extend (modified in place).
    :param smiles: the SMILES string being parsed (for token text and
        error messages).
    :param ltoken: the token that opened the ring number.
    :param latom: the atom the opening ring number is attached to.
    :param lpos: the placeholder position reserved on ``latom`` when the
        ring number was opened.
    :param rtoken: the token that closes the ring number.
    :param ratom: the atom the closing ring number is attached to.
    :raises SMILESParserError: if the two atoms are already bonded, or if
        the bond characters at the two ends are incompatible.
    """
    if mol.has_bond(latom.index, ratom.index):
        err_msg = "ring bond specified between already-bonded atoms"
        raise SMILESParserError(smiles, err_msg, ltoken.start_idx)

    lbond_char = ltoken.extract_bond_char(smiles)
    rbond_char = rtoken.extract_bond_char(smiles)

    # checking that ring bonds match: order the pair so that an unspecified
    # (None) bond, if there is one, comes second
    first, second = lbond_char, rbond_char
    if first is None:
        first, second = second, first

    both_stereo = (
        (first in SMILES_STEREO_BONDS) and (second in SMILES_STEREO_BONDS)
    )
    if not ((first == second) or (second is None) or both_stereo):
        err_msg = "mismatched ring bonds"
        raise SMILESParserError(smiles, err_msg, ltoken.start_idx)

    lorder, lstereo = smiles_to_bond(lbond_char)
    rorder, rstereo = smiles_to_bond(rbond_char)
    neither_given = (first is None) and (second is None)
    if latom.is_aromatic and ratom.is_aromatic and neither_given:
        lorder = rorder = 1.5  # handle implicit aromatic bonds, e.g. c1ccccc1

    # when only one end specifies the bond, the higher order wins
    mol.add_ring_bond(
        a=latom.index, a_stereo=lstereo, a_pos=lpos,
        b=ratom.index, b_stereo=rstereo,
        order=max(lorder, rorder)
    )
| # ============================================================================= | ||
| # SMILES <- Atom, Graph, etc. | ||
| # ============================================================================= | ||
def atom_to_smiles(atom: Atom, brackets: bool = True) -> str:
    """Converts an atom into its SMILES representation.

    An atom with no isotope, no chirality, an unspecified (``None``)
    hydrogen count and zero charge is written as its bare element symbol.
    Any other atom is written in the long form
    ``isotope + element + chirality + H count + charge``.

    :param atom: the input atom; it must not be aromatic.
    :param brackets: True, if brackets should be added around the returned
        symbol (e.g. in the case of [C] or [C@@H]). Defaults to True.
    :return: a SMILES symbol representing the input atom.
    """
    assert not atom.is_aromatic

    specs = (atom.isotope, atom.chirality, atom.h_count, atom.charge)
    if specs == (None, None, None, 0):
        return atom.element

    builder = []
    if brackets:
        builder.append("[")

    if atom.isotope is not None:
        builder.append(str(atom.isotope))
    builder.append(atom.element)
    if atom.chirality is not None:
        builder.append(atom.chirality)

    if atom.h_count not in (None, 0):
        builder.append("H")
        builder.append(str(atom.h_count))
    elif specs == (None, None, 0, 0) and (atom.element in ORGANIC_SUBSET):
        # an explicit zero on an otherwise plain organic-subset atom is
        # written as H0 so that it stays distinct from the bare symbol
        builder.append("H0")
    # an unspecified (None) hydrogen count on an atom that needs the long
    # form is simply omitted; it used to be rendered as the text "HNone"

    if atom.charge != 0:
        builder.append("{:+}".format(atom.charge))

    if brackets:
        builder.append("]")
    return "".join(builder)
def bond_to_smiles(bond: DirectedBond) -> str:
    """Renders a bond as a SMILES bond symbol.

    Double and triple bonds map to ``=`` and ``#``. A single bond maps to
    its stereo character if it has one (``/`` or ``\\``), and to the empty
    string otherwise.

    :param bond: the input bond.
    :return: a SMILES symbol representing the input bond.
    :raises ValueError: if the bond order is not 1, 2 or 3.
    """
    order = bond.order
    if order == 2:
        return "="
    if order == 3:
        return "#"
    if order == 1:
        stereo = bond.stereo
        if stereo in SMILES_STEREO_BONDS:
            return stereo
        return ""
    # this should never happen
    raise ValueError()
def mol_to_smiles(mol: MolecularGraph) -> str:
    """Writes a molecular graph out as a SMILES string, following the
    traversal order stored in the graph.

    Each root of the graph yields one fragment; the fragments are joined
    with dots. Ring numbers are shared across all fragments.

    :param mol: the input molecule; it must be kekulized.
    :return: a SMILES string representing the input molecule.
    """
    assert mol.is_kekulized()

    ring_numbers = {}  # one log for the whole molecule, not per fragment
    pieces = []
    for root_idx in mol.get_roots():
        buffer = []
        _derive_smiles_from_fragment(buffer, mol, root_idx, ring_numbers)
        pieces.append("".join(buffer))
    return ".".join(pieces)
def _derive_smiles_from_fragment(derived, mol, root, ring_log):
    """Appends the SMILES pieces of the fragment rooted at ``root`` to
    ``derived``.

    The fragment is walked depth-first along its outgoing directed bonds.
    The walk uses an explicit stack instead of recursion, so the length of
    a chain is not bounded by the interpreter's recursion limit.

    :param derived: the list of SMILES pieces to extend (modified in place).
    :param mol: the molecular graph being written out.
    :param root: the index of the atom at which the fragment starts.
    :param ring_log: a dict from the sorted pair of atom indices of a ring
        bond to its ring number (modified in place).
    """
    derived.append(atom_to_smiles(mol.get_atom(root)))
    out_bonds = mol.get_out_dirbonds(root)

    # One frame per atom on the current path:
    # (iterator over its remaining out bonds, number of out bonds,
    #  whether a ")" must be written once the atom is finished).
    stack = [(iter(enumerate(out_bonds)), len(out_bonds), False)]

    while stack:
        bonds_iter, n_bonds, closes_branch = stack[-1]

        for i, bond in bonds_iter:
            if bond.ring_bond:
                derived.append(bond_to_smiles(bond))
                ends = (min(bond.src, bond.dst), max(bond.src, bond.dst))
                # both ends of a ring bond map to the same pair, so the
                # second visit reuses the number assigned at the first
                rnum = ring_log.setdefault(ends, len(ring_log) + 1)
                # NOTE(review): numbers of 100 and above would be written
                # as % plus three digits -- confirm this cannot occur.
                if rnum >= 10:
                    derived.append("%")
                derived.append(str(rnum))
            else:
                # every out bond except the last one opens a branch
                is_branch = i < n_bonds - 1
                if is_branch:
                    derived.append("(")
                derived.append(bond_to_smiles(bond))

                # descend into the neighbour; the current iterator resumes
                # where it stopped once the neighbour's frame is popped
                derived.append(atom_to_smiles(mol.get_atom(bond.dst)))
                next_bonds = mol.get_out_dirbonds(bond.dst)
                stack.append(
                    (iter(enumerate(next_bonds)), len(next_bonds), is_branch)
                )
                break
        else:
            # all out bonds of the top atom are done
            stack.pop()
            if closes_branch:
                derived.append(")")
+89
-98
| Metadata-Version: 2.1 | ||
| Name: selfies | ||
| Version: 1.0.4 | ||
| Version: 2.0.0 | ||
| Summary: SELFIES (SELF-referencIng Embedded Strings) is a general-purpose, sequence-based, robust representation of semantically constrained graphs. | ||
| Home-page: https://github.com/aspuru-guzik-group/selfies | ||
| Author: Mario Krenn | ||
| Author: Mario Krenn, Alston Lo, and many other contributors | ||
| Author-email: mario.krenn@utoronto.ca, alan@aspuru.com | ||
@@ -16,20 +16,24 @@ License: UNKNOWN | ||
| [](https://GitHub.com/aspuru-guzik-group/selfies/issues/) | ||
| [](http://selfies.readthedocs.io/?badge=latest) | ||
| [](http://selfiesv2.readthedocs.io/?badge=latest) | ||
| [](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/) | ||
| **Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**<br> | ||
| _Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br> | ||
| [*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br> | ||
| [Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br> | ||
| Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br> | ||
| **Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**\ | ||
| _Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\ | ||
| [*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\ | ||
| [Talk on YouTube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\ | ||
| [Blog post explaining SELFIES in Japanese](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\ | ||
| Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\ | ||
| Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ) | ||
| A main objective is to use SELFIES as direct input into machine learning models,<br> | ||
| in particular in generative models, for the generation of molecular graphs<br> | ||
| --- | ||
| A main objective is to use SELFIES as direct input into machine learning models, | ||
| in particular in generative models, for the generation of molecular graphs | ||
| which are syntactically and semantically valid. | ||
| <center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center> | ||
| <p align="center"> | ||
| <img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"> | ||
| </p> | ||
| ## Installation | ||
@@ -52,3 +56,3 @@ Use pip to install ``selfies``. | ||
| [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md) | ||
| to review the changes between versions of `selfies`: | ||
| to review the changes between versions of `selfies`, before upgrading: | ||
@@ -59,28 +63,23 @@ ```bash | ||
| ## Documentation | ||
| The documentation can be found on | ||
| [ReadTheDocs](https://selfies.readthedocs.io/en/latest/). | ||
| Alternatively, it can be built from the ``docs/`` directory. | ||
| ## Usage | ||
| ### Standard Functions | ||
| ### Overview | ||
| The ``selfies`` library has eight standard functions: | ||
| Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/), | ||
| which contains a thorough tutorial for getting started with ``selfies`` | ||
| and detailed descriptions of the functions | ||
| that ``selfies`` provides. We summarize some key functions below. | ||
| | Function | Description | | ||
| | -------- | ----------- | | ||
| | ``selfies.encoder`` | Translates a SMILES into an equivalent SELFIES. | | ||
| | ``selfies.decoder`` | Translates a SELFIES into an equivalent SMILES. | | ||
| | ``selfies.len_selfies`` | Returns the (symbol) length of a SELFIES. | | ||
| | ``selfies.split_selfies`` | Splits a SELFIES into its symbols. | | ||
| | ``selfies.get_alphabet_from_selfies`` | Builds an alphabet of SELFIES symbols from an iterable of SELFIES. | | ||
| | ``selfies.get_semantic_robust_alphabet`` | Returns a subset of all SELFIES symbols that are semantically constrained. | | ||
| | ``selfies.selfies_to_encoding`` | Converts a SELFIES into a label and/or one-hot encoding. | | ||
| | ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES. | | ||
| | ``selfies.encoder`` | Translates a SMILES string into its corresponding SELFIES string. | | ||
| | ``selfies.decoder`` | Translates a SELFIES string into its corresponding SMILES string. | | ||
| | ``selfies.set_semantic_constraints`` | Configures the semantic constraints that ``selfies`` operates on. | | ||
| | ``selfies.len_selfies`` | Returns the number of symbols in a SELFIES string. | | ||
| | ``selfies.split_selfies`` | Tokenizes a SELFIES string into its individual symbols. | | ||
| | ``selfies.get_alphabet_from_selfies`` | Constructs an alphabet from an iterable of SELFIES strings. | | ||
| | ``selfies.selfies_to_encoding`` | Converts a SELFIES string into its label and/or one-hot encoding. | | ||
| | ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES string. | | ||
| Please read the documentation for more detailed descriptions of these | ||
| functions, and to view the advanced functions, which allow users to | ||
| customize the SELFIES language. | ||
@@ -96,19 +95,41 @@ ### Examples | ||
| # SMILES --> SELFIES translation | ||
| encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]' | ||
| # SMILES -> SELFIES -> SMILES translation | ||
| try: | ||
| benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1] | ||
| benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1 | ||
| except sf.EncoderError: | ||
| pass # sf.encoder error! | ||
| except sf.DecoderError: | ||
| pass # sf.decoder error! | ||
| # SELFIES --> SMILES translation | ||
| decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1' | ||
| len_benzene = sf.len_selfies(benzene_sf) # 8 | ||
| len_benzene = sf.len_selfies(encoded_selfies) # 8 | ||
| symbols_benzene = list(sf.split_selfies(benzene_sf)) | ||
| # ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]'] | ||
| ``` | ||
| symbols_benzene = list(sf.split_selfies(encoded_selfies)) | ||
| # ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]'] | ||
| #### Customizing SELFIES: | ||
| In this example, we relax the semantic constraints of ``selfies`` to allow | ||
| for hypervalences (caution: hypervalence rules are much less understood | ||
| than octet rules. Some molecules containing hypervalences are important, | ||
| but generally, it is not known which molecules are stable and reasonable). | ||
| ```python | ||
| import selfies as sf | ||
| hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid | ||
| standard_derived_smi = sf.decoder(hypervalent_sf) | ||
| # OI (the default constraints for I allow for only 1 bond) | ||
| sf.set_semantic_constraints("hypervalent") | ||
| relaxed_derived_smi = sf.decoder(hypervalent_sf) | ||
| # O=I(O)(O)(O)(O)O (the hypervalent constraints for I allow for 7 bonds) | ||
| ``` | ||
| #### Integer and one-hot encoding SELFIES: | ||
| In this example we first build an alphabet | ||
| from a dataset of SELFIES, and then convert a SELFIES into a | ||
| padded, label-encoded representation. Note that we use the | ||
| ``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) )) | ||
| In this example, we first build an alphabet from a dataset of SELFIES strings, | ||
| and then convert a SELFIES string into its padded encoding. Note that we use the | ||
| ``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) )) | ||
| symbol to pad our SELFIES, which is a special SELFIES symbol that is always | ||
@@ -121,7 +142,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful | ||
| dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]'] | ||
| dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"] | ||
| alphabet = sf.get_alphabet_from_selfies(dataset) | ||
| alphabet.add('[nop]') # '[nop]' is a special padding symbol | ||
| alphabet = list(sorted(alphabet)) | ||
| print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]'] | ||
| alphabet.add("[nop]") # [nop] is a special padding symbol | ||
| alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]'] | ||
@@ -131,19 +151,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5 | ||
| # SELFIES to label encode | ||
| dimethyl_ether = dataset[0] # '[C][O][C]' | ||
| dimethyl_ether = dataset[0] # [C][O][C] | ||
| # [1, 3, 1, 4, 4] | ||
| print(sf.selfies_to_encoding(dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type='label')) | ||
| # [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]] | ||
| print(sf.selfies_to_encoding(dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type='one_hot')) | ||
| label, one_hot = sf.selfies_to_encoding( | ||
| selfies=dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type="both" | ||
| ) | ||
| # label = [1, 3, 1, 4, 4] | ||
| # one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]] | ||
| ``` | ||
| ### More Examples | ||
| ### More Usages and Examples | ||
@@ -158,19 +174,6 @@ * More examples can be found in the ``examples/`` directory, including a | ||
| * Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator). | ||
| * As an improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling. | ||
| ## Handling invalid inputs | ||
| If an invalid input is presented to the encoder or decoder, the return value is `None`. | ||
| The error can be analysed by using the `encoder(...,print_error=True)` option. | ||
| ```python | ||
| import selfies as sf | ||
| invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1" | ||
| selfies_string=sf.encoder(invalid_smiles) | ||
| if selfies_string==None: | ||
| selfies_string=sf.encoder(invalid_smiles,print_error=True) | ||
| # 'Encoding error 'C[C@H](O)[C@@(*)C1=CC=CC=C1': wildcard atom '*' not supported.' | ||
| ``` | ||
| ## Tests | ||
| SELFIES uses `pytest` with `tox` as its testing framework. | ||
| `selfies` uses `pytest` with `tox` as its testing framework. | ||
| All tests can be found in the `tests/` directory. To run the test suite for | ||
@@ -180,29 +183,17 @@ SELFIES, install ``tox`` and run: | ||
| ```bash | ||
| tox | ||
| tox -- --trials=10000 --dataset_samples=10000 | ||
| ``` | ||
| By default, SELFIES is tested against a random subset | ||
| (of size ``dataset_samples=100000``) on various datasets: | ||
| By default, `selfies` is tested against a random subset | ||
| (of size ``dataset_samples=10000``) on various datasets: | ||
| * 130K molecules from [QM9](https://www.nature.com/articles/sdata201422) | ||
| * 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database) | ||
| * 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307) | ||
| * 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet | ||
| * 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet | ||
| * 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database). | ||
| * 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307) | ||
| * 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets | ||
| * 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html). | ||
| Due to its large size, this dataset is not included on the repository. To run tests | ||
| on it, please download the dataset in the ``tests/test_sets`` directory | ||
| and enable its pytest at ``tests/test_on_emolecules.py``. | ||
| on it, please download the dataset into the ``tests/test_sets`` directory | ||
| and run the ``tests/run_on_large_dataset.py`` script. | ||
| Other tests are random and repeated ``trials`` number of times. | ||
| These can be specified as arguments | ||
| ```bash | ||
| tox -- --trials 100 --dataset_samples 100 | ||
| ``` | ||
| where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that | ||
| if ``dataset_samples`` is negative or exceeds the length of the dataset, | ||
| the whole dataset is used. | ||
| ## Version History | ||
@@ -213,5 +204,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md). | ||
| We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin, | ||
| HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean), | ||
| Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports, | ||
| We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin, | ||
| HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan), | ||
| Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports, | ||
| and Robert Pollice for chemistry advice. | ||
@@ -218,0 +209,0 @@ |
+87
-96
@@ -8,20 +8,24 @@ # SELFIES | ||
| [](https://GitHub.com/aspuru-guzik-group/selfies/issues/) | ||
| [](http://selfies.readthedocs.io/?badge=latest) | ||
| [](http://selfiesv2.readthedocs.io/?badge=latest) | ||
| [](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/) | ||
| **Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**<br> | ||
| _Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br> | ||
| [*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br> | ||
| [Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br> | ||
| Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br> | ||
| **Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**\ | ||
| _Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\ | ||
| [*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\ | ||
| [Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\ | ||
| [Blog explaining SELFIES in Japanese language](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\ | ||
| Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\ | ||
| Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ) | ||
| A main objective is to use SELFIES as direct input into machine learning models,<br> | ||
| in particular in generative models, for the generation of molecular graphs<br> | ||
| --- | ||
| A main objective is to use SELFIES as direct input into machine learning models, | ||
| in particular in generative models, for the generation of molecular graphs | ||
| which are syntactically and semantically valid. | ||
| <center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center> | ||
| <p align="center"> | ||
| <img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"> | ||
| </p> | ||
| ## Installation | ||
@@ -44,3 +48,3 @@ Use pip to install ``selfies``. | ||
| [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md) | ||
| to review the changes between versions of `selfies`: | ||
| to review the changes between versions of `selfies`, before upgrading: | ||
@@ -51,28 +55,23 @@ ```bash | ||
| ## Documentation | ||
| The documentation can be found on | ||
| [ReadTheDocs](https://selfies.readthedocs.io/en/latest/). | ||
| Alternatively, it can be built from the ``docs/`` directory. | ||
| ## Usage | ||
| ### Standard Functions | ||
| ### Overview | ||
| The ``selfies`` library has eight standard functions: | ||
| Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/), | ||
| which contains a thorough tutorial for getting started with ``selfies`` | ||
| and detailed descriptions of the functions | ||
| that ``selfies`` provides. We summarize some key functions below. | ||
| | Function | Description | | ||
| | -------- | ----------- | | ||
| | ``selfies.encoder`` | Translates a SMILES into an equivalent SELFIES. | | ||
| | ``selfies.decoder`` | Translates a SELFIES into an equivalent SMILES. | | ||
| | ``selfies.len_selfies`` | Returns the (symbol) length of a SELFIES. | | ||
| | ``selfies.split_selfies`` | Splits a SELFIES into its symbols. | | ||
| | ``selfies.get_alphabet_from_selfies`` | Builds an alphabet of SELFIES symbols from an iterable of SELFIES. | | ||
| | ``selfies.get_semantic_robust_alphabet`` | Returns a subset of all SELFIES symbols that are semantically constrained. | | ||
| | ``selfies.selfies_to_encoding`` | Converts a SELFIES into a label and/or one-hot encoding. | | ||
| | ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES. | | ||
| | ``selfies.encoder`` | Translates a SMILES string into its corresponding SELFIES string. | | ||
| | ``selfies.decoder`` | Translates a SELFIES string into its corresponding SMILES string. | | ||
| | ``selfies.set_semantic_constraints`` | Configures the semantic constraints that ``selfies`` operates on. | | ||
| | ``selfies.len_selfies`` | Returns the number of symbols in a SELFIES string. | | ||
| | ``selfies.split_selfies`` | Tokenizes a SELFIES string into its individual symbols. | | ||
| | ``selfies.get_alphabet_from_selfies`` | Constructs an alphabet from an iterable of SELFIES strings. | | ||
| | ``selfies.selfies_to_encoding`` | Converts a SELFIES string into its label and/or one-hot encoding. | | ||
| | ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES string. | | ||
| Please read the documentation for more detailed descriptions of these | ||
| functions, and to view the advanced functions, which allow users to | ||
| customize the SELFIES language. | ||
@@ -88,19 +87,41 @@ ### Examples | ||
| # SMILES --> SELFIES translation | ||
| encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]' | ||
| # SMILES -> SELFIES -> SMILES translation | ||
| try: | ||
| benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1] | ||
| benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1 | ||
| except sf.EncoderError: | ||
| pass # sf.encoder error! | ||
| except sf.DecoderError: | ||
| pass # sf.decoder error! | ||
| # SELFIES --> SMILES translation | ||
| decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1' | ||
| len_benzene = sf.len_selfies(benzene_sf) # 8 | ||
| len_benzene = sf.len_selfies(encoded_selfies) # 8 | ||
| symbols_benzene = list(sf.split_selfies(benzene_sf)) | ||
| # ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]'] | ||
| ``` | ||
| symbols_benzene = list(sf.split_selfies(encoded_selfies)) | ||
| # ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]'] | ||
| #### Customizing SELFIES: | ||
| In this example, we relax the semantic constraints of ``selfies`` to allow | ||
| for hypervalences (caution: hypervalence rules are much less understood | ||
| than octet rules. Some molecules containing hypervalences are important, | ||
| but generally, it is not known which molecules are stable and reasonable). | ||
| ```python | ||
| import selfies as sf | ||
| hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid | ||
| standard_derived_smi = sf.decoder(hypervalent_sf) | ||
| # OI (the default constraints for I allow for only 1 bond) | ||
| sf.set_semantic_constraints("hypervalent") | ||
| relaxed_derived_smi = sf.decoder(hypervalent_sf) | ||
| # O=I(O)(O)(O)(O)O (the hypervalent constraints for I allow for 7 bonds) | ||
| ``` | ||
| #### Integer and one-hot encoding SELFIES: | ||
| In this example we first build an alphabet | ||
| from a dataset of SELFIES, and then convert a SELFIES into a | ||
| padded, label-encoded representation. Note that we use the | ||
| ``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) )) | ||
| In this example, we first build an alphabet from a dataset of SELFIES strings, | ||
| and then convert a SELFIES string into its padded encoding. Note that we use the | ||
| ``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) )) | ||
| symbol to pad our SELFIES, which is a special SELFIES symbol that is always | ||
@@ -113,7 +134,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful | ||
| dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]'] | ||
| dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"] | ||
| alphabet = sf.get_alphabet_from_selfies(dataset) | ||
| alphabet.add('[nop]') # '[nop]' is a special padding symbol | ||
| alphabet = list(sorted(alphabet)) | ||
| print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]'] | ||
| alphabet.add("[nop]") # [nop] is a special padding symbol | ||
| alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]'] | ||
@@ -123,19 +143,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5 | ||
| # SELFIES to label encode | ||
| dimethyl_ether = dataset[0] # '[C][O][C]' | ||
| dimethyl_ether = dataset[0] # [C][O][C] | ||
| # [1, 3, 1, 4, 4] | ||
| print(sf.selfies_to_encoding(dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type='label')) | ||
| # [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]] | ||
| print(sf.selfies_to_encoding(dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type='one_hot')) | ||
| label, one_hot = sf.selfies_to_encoding( | ||
| selfies=dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type="both" | ||
| ) | ||
| # label = [1, 3, 1, 4, 4] | ||
| # one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]] | ||
| ``` | ||
| ### More Examples | ||
| ### More Usages and Examples | ||
@@ -150,19 +166,6 @@ * More examples can be found in the ``examples/`` directory, including a | ||
| * Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator). | ||
| * An improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling. | ||
| ## Handling invalid inputs | ||
| If an invalid input is presented to the encoder or decoder, the return value is `None`. | ||
| The error can be analysed by using the `encoder(...,print_error=True)` option. | ||
| ```python | ||
| import selfies as sf | ||
| invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1" | ||
| selfies_string=sf.encoder(invalid_smiles) | ||
| if selfies_string==None: | ||
| selfies_string=sf.encoder(invalid_smiles,print_error=True) | ||
| # 'Encoding error 'C[C@H](O)[C@@(*)C1=CC=CC=C1': wildcard atom '*' not supported.' | ||
| ``` | ||
| ## Tests | ||
| SELFIES uses `pytest` with `tox` as its testing framework. | ||
| `selfies` uses `pytest` with `tox` as its testing framework. | ||
| All tests can be found in the `tests/` directory. To run the test suite for | ||
@@ -172,29 +175,17 @@ SELFIES, install ``tox`` and run: | ||
| ```bash | ||
| tox | ||
| tox -- --trials=10000 --dataset_samples=10000 | ||
| ``` | ||
| By default, SELFIES is tested against a random subset | ||
| (of size ``dataset_samples=100000``) on various datasets: | ||
| By default, `selfies` is tested against a random subset | ||
| (of size ``dataset_samples=10000``) on various datasets: | ||
| * 130K molecules from [QM9](https://www.nature.com/articles/sdata201422) | ||
| * 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database) | ||
| * 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307) | ||
| * 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet | ||
| * 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet | ||
| * 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database). | ||
| * 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307) | ||
| * 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets | ||
| * 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html). | ||
| Due to its large size, this dataset is not included in the repository. To run tests | ||
| on it, please download the dataset in the ``tests/test_sets`` directory | ||
| and enable its pytest at ``tests/test_on_emolecules.py``. | ||
| on it, please download the dataset into the ``tests/test_sets`` directory | ||
| and run the ``tests/run_on_large_dataset.py`` script. | ||
| Other tests are random and repeated ``trials`` number of times. | ||
| These can be specified as arguments | ||
| ```bash | ||
| tox -- --trials 100 --dataset_samples 100 | ||
| ``` | ||
| where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that | ||
| if ``dataset_samples`` is negative or exceeds the length of the dataset, | ||
| the whole dataset is used. | ||
| ## Version History | ||
@@ -205,5 +196,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md). | ||
| We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin, | ||
| HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean), | ||
| Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports, | ||
| We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin, | ||
| HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan), | ||
| Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports, | ||
| and Robert Pollice for chemistry advice. | ||
@@ -210,0 +201,0 @@ |
| Metadata-Version: 2.1 | ||
| Name: selfies | ||
| Version: 1.0.4 | ||
| Version: 2.0.0 | ||
| Summary: SELFIES (SELF-referencIng Embedded Strings) is a general-purpose, sequence-based, robust representation of semantically constrained graphs. | ||
| Home-page: https://github.com/aspuru-guzik-group/selfies | ||
| Author: Mario Krenn | ||
| Author: Mario Krenn, Alston Lo, and many other contributors | ||
| Author-email: mario.krenn@utoronto.ca, alan@aspuru.com | ||
@@ -16,20 +16,24 @@ License: UNKNOWN | ||
| [](https://GitHub.com/aspuru-guzik-group/selfies/issues/) | ||
| [](http://selfies.readthedocs.io/?badge=latest) | ||
| [](http://selfiesv2.readthedocs.io/?badge=latest) | ||
| [](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/) | ||
| **Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**<br> | ||
| _Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br> | ||
| [*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br> | ||
| [Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br> | ||
| Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br> | ||
| **Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation**\ | ||
| _Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\ | ||
| [*Machine Learning: Science and Technology* **1**, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\ | ||
| [Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\ | ||
| [Blog explaining SELFIES in Japanese language](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\ | ||
| Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\ | ||
| Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ) | ||
| A main objective is to use SELFIES as direct input into machine learning models,<br> | ||
| in particular in generative models, for the generation of molecular graphs<br> | ||
| --- | ||
| A main objective is to use SELFIES as direct input into machine learning models, | ||
| in particular in generative models, for the generation of molecular graphs | ||
| which are syntactically and semantically valid. | ||
| <center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center> | ||
| <p align="center"> | ||
| <img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"> | ||
| </p> | ||
| ## Installation | ||
@@ -52,3 +56,3 @@ Use pip to install ``selfies``. | ||
| [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md) | ||
| to review the changes between versions of `selfies`: | ||
| to review the changes between versions of `selfies`, before upgrading: | ||
@@ -59,28 +63,23 @@ ```bash | ||
| ## Documentation | ||
| The documentation can be found on | ||
| [ReadTheDocs](https://selfies.readthedocs.io/en/latest/). | ||
| Alternatively, it can be built from the ``docs/`` directory. | ||
| ## Usage | ||
| ### Standard Functions | ||
| ### Overview | ||
| The ``selfies`` library has eight standard functions: | ||
| Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/), | ||
| which contains a thorough tutorial for getting started with ``selfies`` | ||
| and detailed descriptions of the functions | ||
| that ``selfies`` provides. We summarize some key functions below. | ||
| | Function | Description | | ||
| | -------- | ----------- | | ||
| | ``selfies.encoder`` | Translates a SMILES into an equivalent SELFIES. | | ||
| | ``selfies.decoder`` | Translates a SELFIES into an equivalent SMILES. | | ||
| | ``selfies.len_selfies`` | Returns the (symbol) length of a SELFIES. | | ||
| | ``selfies.split_selfies`` | Splits a SELFIES into its symbols. | | ||
| | ``selfies.get_alphabet_from_selfies`` | Builds an alphabet of SELFIES symbols from an iterable of SELFIES. | | ||
| | ``selfies.get_semantic_robust_alphabet`` | Returns a subset of all SELFIES symbols that are semantically constrained. | | ||
| | ``selfies.selfies_to_encoding`` | Converts a SELFIES into a label and/or one-hot encoding. | | ||
| | ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES. | | ||
| | ``selfies.encoder`` | Translates a SMILES string into its corresponding SELFIES string. | | ||
| | ``selfies.decoder`` | Translates a SELFIES string into its corresponding SMILES string. | | ||
| | ``selfies.set_semantic_constraints`` | Configures the semantic constraints that ``selfies`` operates on. | | ||
| | ``selfies.len_selfies`` | Returns the number of symbols in a SELFIES string. | | ||
| | ``selfies.split_selfies`` | Tokenizes a SELFIES string into its individual symbols. | | ||
| | ``selfies.get_alphabet_from_selfies`` | Constructs an alphabet from an iterable of SELFIES strings. | | ||
| | ``selfies.selfies_to_encoding`` | Converts a SELFIES string into its label and/or one-hot encoding. | | ||
| | ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES string. | | ||
| Please read the documentation for more detailed descriptions of these | ||
| functions, and to view the advanced functions, which allow users to | ||
| customize the SELFIES language. | ||
@@ -96,19 +95,41 @@ ### Examples | ||
| # SMILES --> SELFIES translation | ||
| encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]' | ||
| # SMILES -> SELFIES -> SMILES translation | ||
| try: | ||
| benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1] | ||
| benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1 | ||
| except sf.EncoderError: | ||
| pass # sf.encoder error! | ||
| except sf.DecoderError: | ||
| pass # sf.decoder error! | ||
| # SELFIES --> SMILES translation | ||
| decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1' | ||
| len_benzene = sf.len_selfies(benzene_sf) # 8 | ||
| len_benzene = sf.len_selfies(encoded_selfies) # 8 | ||
| symbols_benzene = list(sf.split_selfies(benzene_sf)) | ||
| # ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]'] | ||
| ``` | ||
| symbols_benzene = list(sf.split_selfies(encoded_selfies)) | ||
| # ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]'] | ||
| #### Customizing SELFIES: | ||
| In this example, we relax the semantic constraints of ``selfies`` to allow | ||
| for hypervalences (caution: hypervalence rules are much less understood | ||
| than octet rules. Some molecules containing hypervalences are important, | ||
| but generally, it is not known which molecules are stable and reasonable). | ||
| ```python | ||
| import selfies as sf | ||
| hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid | ||
| standard_derived_smi = sf.decoder(hypervalent_sf) | ||
| # OI (the default constraints for I allow for only 1 bond) | ||
| sf.set_semantic_constraints("hypervalent") | ||
| relaxed_derived_smi = sf.decoder(hypervalent_sf) | ||
| # O=I(O)(O)(O)(O)O (the hypervalent constraints for I allow for 7 bonds) | ||
| ``` | ||
| #### Integer and one-hot encoding SELFIES: | ||
| In this example we first build an alphabet | ||
| from a dataset of SELFIES, and then convert a SELFIES into a | ||
| padded, label-encoded representation. Note that we use the | ||
| ``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) )) | ||
| In this example, we first build an alphabet from a dataset of SELFIES strings, | ||
| and then convert a SELFIES string into its padded encoding. Note that we use the | ||
| ``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) )) | ||
| symbol to pad our SELFIES, which is a special SELFIES symbol that is always | ||
@@ -121,7 +142,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful | ||
| dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]'] | ||
| dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"] | ||
| alphabet = sf.get_alphabet_from_selfies(dataset) | ||
| alphabet.add('[nop]') # '[nop]' is a special padding symbol | ||
| alphabet = list(sorted(alphabet)) | ||
| print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]'] | ||
| alphabet.add("[nop]") # [nop] is a special padding symbol | ||
| alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]'] | ||
@@ -131,19 +151,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5 | ||
| # SELFIES to label encode | ||
| dimethyl_ether = dataset[0] # '[C][O][C]' | ||
| dimethyl_ether = dataset[0] # [C][O][C] | ||
| # [1, 3, 1, 4, 4] | ||
| print(sf.selfies_to_encoding(dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type='label')) | ||
| # [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]] | ||
| print(sf.selfies_to_encoding(dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type='one_hot')) | ||
| label, one_hot = sf.selfies_to_encoding( | ||
| selfies=dimethyl_ether, | ||
| vocab_stoi=symbol_to_idx, | ||
| pad_to_len=pad_to_len, | ||
| enc_type="both" | ||
| ) | ||
| # label = [1, 3, 1, 4, 4] | ||
| # one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]] | ||
| ``` | ||
| ### More Examples | ||
| ### More Usages and Examples | ||
@@ -158,19 +174,6 @@ * More examples can be found in the ``examples/`` directory, including a | ||
| * Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator). | ||
| * An improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling. | ||
| ## Handling invalid inputs | ||
| If an invalid input is presented to the encoder or decoder, the return value is `None`. | ||
| The error can be analysed by using the `encoder(...,print_error=True)` option. | ||
| ```python | ||
| import selfies as sf | ||
| invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1" | ||
| selfies_string=sf.encoder(invalid_smiles) | ||
| if selfies_string==None: | ||
| selfies_string=sf.encoder(invalid_smiles,print_error=True) | ||
| # 'Encoding error 'C[C@H](O)[C@@(*)C1=CC=CC=C1': wildcard atom '*' not supported.' | ||
| ``` | ||
| ## Tests | ||
| SELFIES uses `pytest` with `tox` as its testing framework. | ||
| `selfies` uses `pytest` with `tox` as its testing framework. | ||
| All tests can be found in the `tests/` directory. To run the test suite for | ||
@@ -180,29 +183,17 @@ SELFIES, install ``tox`` and run: | ||
| ```bash | ||
| tox | ||
| tox -- --trials=10000 --dataset_samples=10000 | ||
| ``` | ||
| By default, SELFIES is tested against a random subset | ||
| (of size ``dataset_samples=100000``) on various datasets: | ||
| By default, `selfies` is tested against a random subset | ||
| (of size ``dataset_samples=10000``) on various datasets: | ||
| * 130K molecules from [QM9](https://www.nature.com/articles/sdata201422) | ||
| * 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database) | ||
| * 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307) | ||
| * 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet | ||
| * 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet | ||
| * 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database). | ||
| * 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307) | ||
| * 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets | ||
| * 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html). | ||
| Due to its large size, this dataset is not included in the repository. To run tests | ||
| on it, please download the dataset in the ``tests/test_sets`` directory | ||
| and enable its pytest at ``tests/test_on_emolecules.py``. | ||
| on it, please download the dataset into the ``tests/test_sets`` directory | ||
| and run the ``tests/run_on_large_dataset.py`` script. | ||
| Other tests are random and repeated ``trials`` number of times. | ||
| These can be specified as arguments | ||
| ```bash | ||
| tox -- --trials 100 --dataset_samples 100 | ||
| ``` | ||
| where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that | ||
| if ``dataset_samples`` is negative or exceeds the length of the dataset, | ||
| the whole dataset is used. | ||
| ## Version History | ||
@@ -213,5 +204,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md). | ||
| We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin, | ||
| HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean), | ||
| Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports, | ||
| We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin, | ||
| HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan), | ||
| Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports, | ||
| and Robert Pollice for chemistry advice. | ||
@@ -218,0 +209,0 @@ |
| README.md | ||
| setup.py | ||
| selfies/__init__.py | ||
| selfies/bond_constraints.py | ||
| selfies/compatibility.py | ||
| selfies/constants.py | ||
| selfies/decoder.py | ||
| selfies/encoder.py | ||
| selfies/exceptions.py | ||
| selfies/grammar_rules.py | ||
| selfies/kekulize.py | ||
| selfies/utils.py | ||
| selfies/mol_graph.py | ||
| selfies.egg-info/PKG-INFO | ||
| selfies.egg-info/SOURCES.txt | ||
| selfies.egg-info/dependency_links.txt | ||
| selfies.egg-info/top_level.txt | ||
| selfies.egg-info/top_level.txt | ||
| selfies/utils/__init__.py | ||
| selfies/utils/encoding_utils.py | ||
| selfies/utils/linked_list.py | ||
| selfies/utils/matching_utils.py | ||
| selfies/utils/selfies_utils.py | ||
| selfies/utils/smiles_utils.py |
+21
-20
@@ -18,7 +18,7 @@ #!/usr/bin/env python | ||
| Typical usage example: | ||
| import selfies | ||
| import selfies as sf | ||
| benzene = "C1=CC=CC=C1" | ||
| selfies_benzene = selfies.encoder(benzene) | ||
| smiles_benzene = selfies.decoder(selfies_benzene) | ||
| benzene_selfies = sf.encoder(benzene) | ||
| benzene_smiles = sf.decoder(benzene_selfies) | ||
@@ -29,3 +29,3 @@ For comments, bug reports or feature ideas, please send an email to | ||
| __version__ = "1.0.3" | ||
| __version__ = "2.0.0" | ||
@@ -35,6 +35,4 @@ __all__ = [ | ||
| "decoder", | ||
| "get_preset_constraints", | ||
| "get_semantic_robust_alphabet", | ||
| "get_default_constraints", | ||
| "get_octet_rule_constraints", | ||
| "get_hypervalent_constraints", | ||
| "get_semantic_constraints", | ||
@@ -49,22 +47,25 @@ "set_semantic_constraints", | ||
| "batch_flat_hot_to_selfies", | ||
| "EncoderError", | ||
| "DecoderError" | ||
| ] | ||
| from .bond_constraints import ( | ||
| get_preset_constraints, | ||
| get_semantic_constraints, | ||
| get_semantic_robust_alphabet, | ||
| set_semantic_constraints | ||
| ) | ||
| from .decoder import decoder | ||
| from .encoder import encoder | ||
| from .grammar_rules import ( | ||
| get_semantic_robust_alphabet, | ||
| get_default_constraints, | ||
| get_octet_rule_constraints, | ||
| get_hypervalent_constraints, | ||
| get_semantic_constraints, | ||
| set_semantic_constraints, | ||
| from .exceptions import DecoderError, EncoderError | ||
| from .utils.encoding_utils import ( | ||
| batch_flat_hot_to_selfies, | ||
| batch_selfies_to_flat_hot, | ||
| encoding_to_selfies, | ||
| selfies_to_encoding | ||
| ) | ||
| from .utils import ( | ||
| from .utils.selfies_utils import ( | ||
| get_alphabet_from_selfies, | ||
| len_selfies, | ||
| split_selfies, | ||
| selfies_to_encoding, | ||
| batch_selfies_to_flat_hot, | ||
| encoding_to_selfies, | ||
| batch_flat_hot_to_selfies, | ||
| split_selfies | ||
| ) |
+166
-317
@@ -1,372 +0,221 @@ | ||
| from collections import OrderedDict | ||
| from typing import Dict, Iterable, List, Optional, Tuple, Union | ||
| import warnings | ||
| from selfies.grammar_rules import (get_bond_from_num, | ||
| get_hypervalent_constraints, | ||
| get_n_from_symbols, get_next_branch_state, | ||
| get_next_state, get_num_from_bond, | ||
| get_octet_rule_constraints, | ||
| get_semantic_constraints, | ||
| set_semantic_constraints) | ||
| from selfies.compatibility import modernize_symbol | ||
| from selfies.exceptions import DecoderError | ||
| from selfies.grammar_rules import ( | ||
| get_index_from_selfies, | ||
| next_atom_state, | ||
| next_branch_state, | ||
| next_ring_state, | ||
| process_atom_symbol, | ||
| process_branch_symbol, | ||
| process_ring_symbol | ||
| ) | ||
| from selfies.mol_graph import MolecularGraph | ||
| from selfies.utils.selfies_utils import split_selfies | ||
| from selfies.utils.smiles_utils import mol_to_smiles | ||
| def decoder(selfies: str, | ||
| print_error: bool = False, | ||
| constraints: Optional[str] = None) -> Optional[str]: | ||
| """Translates a SELFIES into a SMILES. | ||
| def decoder(selfies: str, compatible: bool = False) -> str: | ||
| """Translates a SELFIES string into its corresponding SMILES string. | ||
| The SELFIES to SMILES translation operates based on the :mod:`selfies` | ||
| grammar rules, which can be configured using | ||
| :func:`selfies.set_semantic_constraints`. Given the appropriate settings, | ||
| the decoded SMILES will always be syntactically and semantically correct. | ||
| That is, the output SMILES will satisfy the specified bond constraints. | ||
| Additionally, :func:`selfies.decoder` will attempt to preserve the | ||
| atom and branch order of the input SELFIES. | ||
| This translation is deterministic but depends on the current semantic | ||
| constraints. The output SMILES string is guaranteed to be syntactically | ||
| correct and guaranteed to represent a molecule that obeys the | ||
| semantic constraints. | ||
| :param selfies: the SELFIES to be translated. | ||
| :param print_error: if True, error messages will be printed to console. | ||
| Defaults to False. | ||
| :param constraints: if ``'octet_rule'`` or ``'hypervalent'``, | ||
| the corresponding preset bond constraints will be used instead. | ||
| If ``None``, :func:`selfies.decoder` will use the | ||
| currently configured bond constraints. Defaults to ``None``. | ||
| :return: the SMILES translation of ``selfies``. If an error occurs, | ||
| and ``selfies`` cannot be translated, ``None`` is returned instead. | ||
| :param selfies: the SELFIES string to be translated. | ||
| :param compatible: if ``True``, this function will accept SELFIES strings | ||
| containing deprecated symbols from previous releases. However, this | ||
| function may behave differently than in previous major releases, | ||
| and should not be treated as backward compatible. | ||
| Defaults to ``False``. | ||
| :return: a SMILES string derived from the input SELFIES string. | ||
| :raises DecoderError: if the input SELFIES string is malformed. | ||
| :Example: | ||
| >>> import selfies | ||
| >>> selfies.decoder('[C][=C][F]') | ||
| >>> import selfies as sf | ||
| >>> sf.decoder('[C][=C][F]') | ||
| 'C=CF' | ||
| .. seealso:: The | ||
| `"octet_rule" <https://en.wikipedia.org/wiki/Octet_rule>`_ | ||
| and | ||
| `"hypervalent" <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_ | ||
| preset bond constraints | ||
| can be viewed with :func:`selfies.get_octet_rule_constraints` and | ||
| :func:`selfies.get_hypervalent_constraints`, respectively. These | ||
| presets are variants of the "default" bond constraints, which can | ||
| be viewed with :func:`selfies.get_default_constraints`. Their | ||
| differences can be summarized as follows: | ||
| * def. : ``Cl``, ``Br``, ``I``: 1, ``N``: 3, ``P``: 5, ``P+1``: 6, ``P-1``: 4, ``S``: 6, ``S+1``: 7, ``S-1``: 5 | ||
| * oct. : ``Cl``, ``Br``, ``I``: 1, ``N``: 3, ``P``: 3, ``P+1``: 4, ``P-1``: 2, ``S``: 2, ``S+1``: 3, ``S-1``: 1 | ||
| * hyp. : ``Cl``, ``Br``, ``I``: 7, ``N``: 5, ``P``: 5, ``P+1``: 6, ``P-1``: 4, ``S``: 6, ``S+1``: 7, ``S-1``: 5 | ||
| """ | ||
| old_constraints = get_semantic_constraints() | ||
| if constraints is None: | ||
| pass | ||
| elif constraints == 'octet_rule': | ||
| set_semantic_constraints(get_octet_rule_constraints()) | ||
| elif constraints == 'hypervalent': | ||
| set_semantic_constraints(get_hypervalent_constraints()) | ||
| else: | ||
| raise ValueError("unrecognized constraint type") | ||
| if compatible: | ||
| msg = "\nselfies.decoder() may behave differently than in previous " \ | ||
| "major releases. We recommend using SELFIES that are up to date." | ||
| warnings.warn(msg, stacklevel=2) | ||
| try: | ||
| all_smiles = [] # process dot-separated fragments separately | ||
| mol = MolecularGraph() | ||
| for s in selfies.split("."): | ||
| smiles = _translate_selfies(s) | ||
| rings = [] | ||
| for s in selfies.split("."): | ||
| _derive_mol_from_symbols( | ||
| symbol_iter=_tokenize_selfies(s, compatible), | ||
| mol=mol, | ||
| selfies=selfies, | ||
| max_derive=float("inf"), | ||
| init_state=0, | ||
| root_atom=None, | ||
| rings=rings | ||
| ) | ||
| _form_rings_bilocally(mol, rings) | ||
| return mol_to_smiles(mol) | ||
| if smiles != "": # prevent malformed dots (e.g. [C]..[C], .[C][C]) | ||
| all_smiles.append(smiles) | ||
| if constraints is not None: # restore old constraints | ||
| set_semantic_constraints(old_constraints) | ||
| def _tokenize_selfies(selfies, compatible): | ||
| if isinstance(selfies, str): | ||
| symbol_iter = split_selfies(selfies) | ||
| elif isinstance(selfies, list): | ||
| symbol_iter = selfies | ||
| else: | ||
| raise ValueError() # should not happen | ||
| return '.'.join(all_smiles) | ||
| try: | ||
| for symbol in symbol_iter: | ||
| if symbol == "[nop]": | ||
| continue | ||
| if compatible: | ||
| symbol = modernize_symbol(symbol) | ||
| yield symbol | ||
| except ValueError as err: | ||
| if constraints is not None: # restore old constraints | ||
| set_semantic_constraints(old_constraints) | ||
| raise DecoderError(str(err)) from None | ||
| if print_error: | ||
| print("Decoding error '{}': {}.".format(selfies, err)) | ||
| return None | ||
| def _parse_selfies(selfies: str) -> Iterable[str]: | ||
| """Parses a SELFIES into its symbols. | ||
| A generator, which parses a SELFIES and yields its symbols | ||
| one-by-one. When no symbols are left in the SELFIES, the empty | ||
| string is infinitely yielded. As a precondition, the input SELFIES contains | ||
| no dots, so all symbols are enclosed by square brackets, e.g. [X]. | ||
| :param selfies: the SELFIES string to be parsed. | ||
| :return: an iterable of the symbols of the SELFIES. | ||
| """ | ||
| left_idx = selfies.find('[') | ||
| while 0 <= left_idx < len(selfies): | ||
| right_idx = selfies.find(']', left_idx + 1) | ||
| if (selfies[left_idx] != '[') or (right_idx == -1): | ||
| raise ValueError("malformed SELIFES, " | ||
| "misplaced or missing brackets") | ||
| next_symbol = selfies[left_idx: right_idx + 1] | ||
| left_idx = right_idx + 1 | ||
| if next_symbol != '[nop]': # skip [nop] | ||
| yield next_symbol | ||
| while True: # no more symbols left | ||
| yield '' | ||
| def _parse_selfies_symbols(selfies_symbols: List[str]) -> Iterable[str]: | ||
| """Equivalent to ``_parse_selfies``, except the input SELFIES is presented | ||
| as a list of SELFIES symbols, as opposed to a string. | ||
| :param selfies_symbols: a SELFIES represented as a list of SELFIES symbols. | ||
| :return: an iterable of the symbols of the SELFIES. | ||
| """ | ||
| for symbol in selfies_symbols: | ||
| if symbol != '[nop]': | ||
| yield symbol | ||
| while True: | ||
| yield '' | ||
| def _translate_selfies(selfies: str) -> str: | ||
| """A helper for ``selfies.decoder``, which translates a SELFIES into a | ||
| SMILES (assuming the input SELFIES contains no dots). | ||
| :param selfies: the SELFIES to be translated. | ||
| :return: the SMILES translation of the SELFIES. | ||
| """ | ||
| selfies_gen = _parse_selfies(selfies) | ||
| # derived[i] is a list with three elements: | ||
| # (1) a string representing the i-th derived atom, and its connecting | ||
| # bond (e.g. =C, #N, N, C are all possible) | ||
| # (2) the number of available bonds the i-th atom has to make | ||
| # (3) the index of the previously derived atom that the i-th derived | ||
| # atom is bonded to | ||
| # Example: if the 6-th derived atom was 'C', had 2 available bonds, | ||
| # and was connected to the 5-th derived atom by a double bond, then | ||
| # derived[6] = ['=C', 2, 5] | ||
| derived = [] | ||
| # each item of <branches> is a key-value pair of indices that represents | ||
| # the branches to be made. If a branch starts at the i-th derived atom | ||
| # and ends at the j-th derived atom, then branches[i] = j. No two | ||
| # branches should start at the same atom, e.g. C((C)Cl)C | ||
| branches = {} | ||
| # each element of <rings> is a tuple of size three that represents the | ||
| # rings to be made, in the same order they appear in the SELFIES (left | ||
| # to right). If the i-th ring is between the j-th and k-th derived atoms | ||
| # (j <= k) and has bond symbol s ('=', '#', '\', etc.), then | ||
| # rings[i] = (j, k, s). | ||
| rings = [] | ||
| _translate_selfies_derive(selfies_gen, 0, derived, -1, branches, rings) | ||
| _form_rings_bilocally(derived, rings) | ||
| # create branches | ||
| for lb, rb in branches.items(): | ||
| derived[lb][0] = '(' + derived[lb][0] | ||
| derived[rb][0] += ')' | ||
| smiles = "" | ||
| for s, _, _ in derived: # construct SMILES from <derived> | ||
| smiles += s | ||
| return smiles | ||
| # flake8: noqa: C901 | ||
| # noinspection PyTypeChecker | ||
| def _translate_selfies_derive(selfies_gen: Iterable[str], | ||
| init_state: int, | ||
| derived: List[List[Union[str, int]]], | ||
| prev_idx: int, | ||
| branches: Dict[int, int], | ||
| rings: List[Tuple[int, int, str]]) -> None: | ||
| """Recursive helper for _translate_selfies. | ||
| Derives the SMILES symbols one-by-one from a SELFIES, and | ||
| populates derived, branches, and rings. The main chain and side branches | ||
| of the SELFIES are translated recursively. Rings are not actually | ||
| translated, but saved to the rings list to be added later. | ||
| :param selfies_gen: an iterable of the symbols of the SELFIES to be | ||
| translated, created by ``_parse_selfies``. | ||
| :param init_state: the initial derivation state. | ||
| :param derived: see ``derived`` in ``_translate_selfies``. | ||
| :param prev_idx: the index of the previously derived atom, or -1, | ||
| if no atoms have been derived yet. | ||
| :param branches: see ``branches`` in ``_translate_selfies``. | ||
| :param rings: see ``rings`` in ``_translate_selfies``. | ||
| :return: ``None``. | ||
| """ | ||
| curr_symbol = next(selfies_gen) | ||
| def _derive_mol_from_symbols( | ||
| symbol_iter, mol, selfies, max_derive, | ||
| init_state, root_atom, rings | ||
| ): | ||
| n_derived = 0 | ||
| state = init_state | ||
| prev_atom = root_atom | ||
| while curr_symbol != '' and state >= 0: | ||
| while (state is not None) and (n_derived < max_derive): | ||
| # Case 1: Branch symbol (e.g. [Branch1_2]) | ||
| if 'Branch' in curr_symbol: | ||
| try: # retrieve next symbol | ||
| symbol = next(symbol_iter) | ||
| n_derived += 1 | ||
| except StopIteration: | ||
| break | ||
| branch_init_state, new_state = \ | ||
| get_next_branch_state(curr_symbol, state) | ||
| # Case 1: Branch symbol (e.g. [Branch1]) | ||
| if "ch" == symbol[-4:-2]: | ||
| if state <= 1: # state = 0, 1 | ||
| pass # ignore no symbols | ||
| output = process_branch_symbol(symbol) | ||
| if output is None: | ||
| _raise_decoder_error(selfies, symbol) | ||
| btype, n = output | ||
| if state <= 1: | ||
| next_state = state | ||
| else: | ||
| L = int(curr_symbol[-4]) # corresponds to [BranchL_X] | ||
| L_symbols = [] | ||
| for _ in range(L): | ||
| L_symbols.append(next(selfies_gen)) | ||
| binit_state, next_state = next_branch_state(btype, state) | ||
| N = get_n_from_symbols(*L_symbols) | ||
| Q = _read_index_from_selfies(symbol_iter, n_symbols=n) | ||
| n_derived += n + _derive_mol_from_symbols( | ||
| symbol_iter, mol, selfies, (Q + 1), | ||
| init_state=binit_state, root_atom=prev_atom, rings=rings | ||
| ) | ||
| branch_symbols = [] | ||
| for _ in range(N + 1): | ||
| branch_symbols.append(next(selfies_gen)) | ||
| branch_gen = _parse_selfies_symbols(branch_symbols) | ||
| branch_start = len(derived) | ||
| _translate_selfies_derive(branch_gen, branch_init_state, | ||
| derived, prev_idx, branches, rings) | ||
| branch_end = len(derived) - 1 | ||
| # resolve C((C)Cl)C --> C(C)(Cl)C | ||
| while branch_start in branches: | ||
| branch_start = branches[branch_start] + 1 | ||
| # finally, register the branch in branches | ||
| if branch_start <= branch_end: | ||
| branches[branch_start] = branch_end | ||
| # Case 2: Ring symbol (e.g. [Ring2]) | ||
| elif 'Ring' in curr_symbol: | ||
| elif "ng" == symbol[-4:-2]: | ||
| new_state = state | ||
| output = process_ring_symbol(symbol) | ||
| if output is None: | ||
| _raise_decoder_error(selfies, symbol) | ||
| ring_type, n, stereo = output | ||
| if state == 0: | ||
| pass # ignore no symbols | ||
| next_state = state | ||
| else: | ||
| L = int(curr_symbol[-2]) # corresponds to [RingL] | ||
| L_symbols = [] | ||
| for _ in range(L): | ||
| L_symbols.append(next(selfies_gen)) | ||
| ring_order, next_state = next_ring_state(ring_type, state) | ||
| bond_info = (ring_order, stereo) | ||
| N = get_n_from_symbols(*L_symbols) | ||
| Q = _read_index_from_selfies(symbol_iter, n_symbols=n) | ||
| n_derived += n | ||
| lidx = max(0, prev_atom.index - (Q + 1)) | ||
| rings.append((mol.get_atom(lidx), prev_atom, bond_info)) | ||
| left_idx = max(0, prev_idx - (N + 1)) | ||
| right_idx = prev_idx | ||
| # Case 3: [epsilon] | ||
| elif "eps" in symbol: | ||
| next_state = 0 if (state == 0) else None | ||
| bond_symbol = '' | ||
| if curr_symbol[1:5] == 'Expl': | ||
| bond_symbol = curr_symbol[5] | ||
| rings.append((left_idx, right_idx, bond_symbol)) | ||
| # Case 3: regular symbol (e.g. [N], [=C], [F]) | ||
| # Case 4: regular symbol (e.g. [N], [=C], [F]) | ||
| else: | ||
| new_symbol, new_state = get_next_state(curr_symbol, state) | ||
| if new_symbol != '': # in case of [epsilon] | ||
| derived.append([new_symbol, new_state, prev_idx]) | ||
| output = process_atom_symbol(symbol) | ||
| if output is None: | ||
| _raise_decoder_error(selfies, symbol) | ||
| (bond_order, stereo), atom = output | ||
| cap = atom.bonding_capacity | ||
| if prev_idx >= 0: | ||
| bond_num = get_num_from_bond(new_symbol[0]) | ||
| derived[prev_idx][1] -= bond_num | ||
| bond_order, next_state = next_atom_state(bond_order, cap, state) | ||
| if bond_order == 0: | ||
| if state == 0: | ||
| mol.add_atom(atom, True) | ||
| else: | ||
| mol.add_atom(atom) | ||
| src, dst = prev_atom.index, atom.index | ||
| mol.add_bond(src=src, dst=dst, order=bond_order, stereo=stereo) | ||
| prev_atom = atom | ||
| prev_idx = len(derived) - 1 | ||
| if next_state is None: | ||
| break | ||
| state = next_state | ||
| curr_symbol = next(selfies_gen) # update symbol and state | ||
| state = new_state | ||
| while n_derived < max_derive: # consume remaining tokens | ||
| try: | ||
| next(symbol_iter) | ||
| n_derived += 1 | ||
| except StopIteration: | ||
| break | ||
| return n_derived | ||
| def _form_rings_bilocally(derived: List[List[Union[str, int]]], | ||
| rings: List[Tuple[int, int, str]]) -> None: | ||
| """Forms all the rings specified by the rings list, in first-to-last order, | ||
| by updating derived. | ||
| :param derived: see ``derived`` in ``_translate_selfies``. | ||
| :param rings: see ``rings`` in ``_translate_selfies``. | ||
| :return: ``None``. | ||
| """ | ||
| def _raise_decoder_error(selfies, invalid_symbol): | ||
| err_msg = "invalid symbol '{}'\n\tSELFIES: {}".format( | ||
| invalid_symbol, selfies | ||
| ) | ||
| raise DecoderError(err_msg) | ||
| # due to the behaviour of allowing multiple rings between the same atom | ||
| # pair, or rings between already bonded atoms, we first resolve all rings | ||
| # so that only valid rings are left and placed into <ring_locs>. | ||
| ring_locs = OrderedDict() | ||
| for left_idx, right_idx, bond_symbol in rings: | ||
| def _read_index_from_selfies(symbol_iter, n_symbols): | ||
| index_symbols = [] | ||
| for _ in range(n_symbols): | ||
| try: | ||
| index_symbols.append(next(symbol_iter)) | ||
| except StopIteration: | ||
| index_symbols.append(None) | ||
| return get_index_from_selfies(*index_symbols) | ||
| if left_idx == right_idx: # ring to the same atom forbidden | ||
| continue | ||
| left_end = derived[left_idx] | ||
| right_end = derived[right_idx] | ||
| bond_num = get_num_from_bond(bond_symbol) | ||
| def _form_rings_bilocally(mol, rings): | ||
| rings_made = [0] * len(mol) | ||
| if left_end[1] <= 0 or right_end[1] <= 0: | ||
| continue # no room for bond | ||
| for latom, ratom, bond_info in rings: | ||
| lidx, ridx = latom.index, ratom.index | ||
| if bond_num > min(left_end[1], right_end[1]): | ||
| bond_num = min(left_end[1], right_end[1]) | ||
| bond_symbol = get_bond_from_num(bond_num) | ||
| if lidx == ridx: # ring to the same atom forbidden | ||
| continue | ||
| # ring is formed between two atoms that are already bonded | ||
| # e.g. CC1C1C --> CC=CC | ||
| if left_idx == right_end[2]: | ||
| order, (lstereo, rstereo) = bond_info | ||
| lfree = latom.bonding_capacity - mol.get_bond_count(lidx) | ||
| rfree = ratom.bonding_capacity - mol.get_bond_count(ridx) | ||
| right_symbol = right_end[0] | ||
| if lfree <= 0 or rfree <= 0: | ||
| continue # no room for ring bond | ||
| order = min(order, lfree, rfree) | ||
| if right_symbol[0] in {'-', '/', '\\', '=', '#'}: | ||
| old_bond = right_symbol[0] | ||
| else: | ||
| old_bond = '' | ||
| if mol.has_bond(a=lidx, b=ridx): | ||
| bond = mol.get_dirbond(src=lidx, dst=ridx) | ||
| new_order = min(order + bond.order, 3) | ||
| mol.update_bond_order(a=lidx, b=ridx, new_order=new_order) | ||
| # update bond multiplicity and symbol | ||
| new_bond_num = min(bond_num + get_num_from_bond(old_bond), 3) | ||
| new_bond_symbol = get_bond_from_num(new_bond_num) | ||
| right_end[0] = new_bond_symbol + right_end[0][len(old_bond):] | ||
| # ring is formed between two atoms that are not bonded, e.g. C1CC1C | ||
| else: | ||
| loc = (left_idx, right_idx) | ||
| if loc in ring_locs: | ||
| # a ring is formed between two atoms that have previously | ||
| # been bonded by a ring, so ring bond multiplicity is updated | ||
| new_bond_num = min(bond_num | ||
| + get_num_from_bond(ring_locs[loc]), 3) | ||
| new_bond_symbol = get_bond_from_num(new_bond_num) | ||
| ring_locs[loc] = new_bond_symbol | ||
| else: | ||
| ring_locs[loc] = bond_symbol | ||
| left_end[1] -= bond_num | ||
| right_end[1] -= bond_num | ||
| # finally, use <ring_locs> to add all the rings into <derived> | ||
| ring_counter = 1 | ||
| for (left_idx, right_idx), bond_symbol in ring_locs.items(): | ||
| ring_id = str(ring_counter) | ||
| if len(ring_id) == 2: | ||
| ring_id = "%" + ring_id | ||
| ring_counter += 1 # increment | ||
| derived[left_idx][0] += bond_symbol + ring_id | ||
| derived[right_idx][0] += bond_symbol + ring_id | ||
| mol.add_ring_bond( | ||
| a=lidx, a_stereo=lstereo, a_pos=rings_made[lidx], | ||
| b=ridx, b_stereo=rstereo, b_pos=rings_made[ridx], | ||
| order=order | ||
| ) | ||
| rings_made[lidx] += 1 | ||
| rings_made[ridx] += 1 |
+155
-217
@@ -1,265 +0,203 @@ | ||
| from typing import Dict, Iterable, List, Optional, Tuple | ||
| from selfies.exceptions import EncoderError, SMILESParserError | ||
| from selfies.grammar_rules import get_selfies_from_index | ||
| from selfies.utils.linked_list import SinglyLinkedList | ||
| from selfies.utils.smiles_utils import ( | ||
| atom_to_smiles, | ||
| bond_to_smiles, | ||
| smiles_to_mol | ||
| ) | ||
| from selfies.grammar_rules import get_num_from_bond, get_symbols_from_n | ||
| from selfies.kekulize import kekulize_parser | ||
| def encoder(smiles: str, strict: bool = True) -> str: | ||
| """Translates a SMILES string into its corresponding SELFIES string. | ||
| def encoder(smiles: str, print_error: bool = False) -> Optional[str]: | ||
| """Translates a SMILES into a SELFIES. | ||
| This translation is deterministic and does not depend on the | ||
| current semantic constraints. Additionally, it preserves the atom order | ||
| of the input SMILES string; thus, one could generate randomized SELFIES | ||
| strings by generating randomized SMILES strings, and then translating them. | ||
| The SMILES to SELFIES translation occurs independently of the SELFIES | ||
| alphabet and grammar. Thus, :func:`selfies.encoder` will work regardless of | ||
| the alphabet and grammar rules that :py:mod:`selfies` is operating on, | ||
| assuming the input is a valid SMILES. Additionally, :func:`selfies.encoder` | ||
| preserves the atom and branch order of the input SMILES; thus, one | ||
| could generate random SELFIES corresponding to the same molecule by | ||
| generating random SMILES, and then translating them. | ||
| By nature of SELFIES, it is impossible to represent molecules that | ||
| violate the current semantic constraints as SELFIES strings. | ||
| Thus, we provide the ``strict`` flag to guard against such cases. If | ||
| ``strict=True``, then this function will raise a | ||
| :class:`selfies.EncoderError` if the input SMILES string represents | ||
| a molecule that violates the semantic constraints. If | ||
| ``strict=False``, then this function will not raise any error; however, | ||
| calling :func:`selfies.decoder` on a SELFIES string generated this | ||
| way will *not* be guaranteed to recover a SMILES string representing | ||
| the original molecule. | ||
| However, encoding and then decoding a SMILES may not necessarily yield | ||
| the original SMILES. Reasons include: | ||
| :param smiles: the SMILES string to be translated. It is recommended to | ||
| use RDKit to check that the strings passed into this function | ||
| are valid SMILES strings. | ||
| :param strict: if ``True``, this function will check that the | ||
| input SMILES string obeys the semantic constraints. | ||
| Defaults to ``True``. | ||
| :return: a SELFIES string translated from the input SMILES string. | ||
| :raises EncoderError: if the input SMILES string is invalid, | ||
| cannot be kekulized, or violates the semantic constraints with | ||
| ``strict=True``. | ||
| 1. SMILES with aromatic symbols are automatically | ||
| Kekulized before being translated. | ||
| 2. SMILES that violate the bond constraints specified by | ||
| :mod:`selfies` will be successfully encoded by | ||
| :func:`selfies.encoder`, but then decoded into a new molecule | ||
| that satisfies the constraints. | ||
| 3. The exact ring numbering order is lost in :func:`selfies.encoder`, | ||
| and cannot be reconstructed by :func:`selfies.decoder`. | ||
| Finally, note that :func:`selfies.encoder` does **not** check if the input | ||
| SMILES is valid, and should not be expected to reject invalid inputs. | ||
| It is recommended to use RDKit to first verify that the SMILES are | ||
| valid. | ||
| :param smiles: the SMILES to be translated. | ||
| :param print_error: if True, error messages will be printed to console. | ||
| Defaults to False. | ||
| :return: the SELFIES translation of ``smiles``. If an error occurs, | ||
| and ``smiles`` cannot be translated, :code:`None` is returned instead. | ||
| :Example: | ||
| >>> import selfies | ||
| >>> selfies.encoder('C=CF') | ||
| >>> import selfies as sf | ||
| >>> sf.encoder("C=CF") | ||
| '[C][=C][F]' | ||
| .. note:: Currently, :func:`selfies.encoder` does not support the | ||
| following types of SMILES: | ||
| .. note:: This function does not currently support SMILES with: | ||
| * SMILES using ring numbering across a dot-bond symbol | ||
| to specify bonds, e.g. ``C1.C2.C12`` (propane) or | ||
| ``c1cc([O-].[Na+])ccc1`` (sodium phenoxide). | ||
| * SMILES with ring numbering between atoms that are over | ||
| ``16 ** 3 = 4096`` atoms apart. | ||
| * SMILES using the wildcard symbol ``*``. | ||
| * SMILES using chiral specifications other than ``@`` and ``@@``. | ||
| * The wildcard symbol ``*``. | ||
| * The quadruple bond symbol ``$``. | ||
| * Chirality specifications other than ``@`` and ``@@``. | ||
| * Ring bonds across a dot symbol (e.g. ``c1cc([O-].[Na+])ccc1``) or | ||
| ring bonds between atoms that are over 4000 atoms apart. | ||
| Although SELFIES does not have aromatic symbols, this function | ||
| *does* support aromatic SMILES strings by internally kekulizing them | ||
| before translation. | ||
| """ | ||
| try: | ||
| if '*' in smiles: | ||
| raise ValueError("wildcard atom '*' not supported") | ||
| mol = smiles_to_mol(smiles) | ||
| except SMILESParserError as err: | ||
| err_msg = "failed to parse input\n\tSMILES: {}".format(smiles) | ||
| raise EncoderError(err_msg) from err | ||
| all_selfies = [] # process dot-separated fragments separately | ||
| for s in smiles.split("."): | ||
| all_selfies.append(_translate_smiles(s)) | ||
| return '.'.join(all_selfies) | ||
| if not mol.kekulize(): | ||
| err_msg = "kekulization failed\n\tSMILES: {}".format(smiles) | ||
| raise EncoderError(err_msg) | ||
| except ValueError as err: | ||
| if print_error: | ||
| print("Encoding error '{}': {}.".format(smiles, err)) | ||
| return None | ||
| if strict: | ||
| _check_bond_constraints(mol, smiles) | ||
| # invert chirality of atoms where necessary, | ||
| # such that they are restored when the SELFIES is decoded | ||
| for atom in mol.get_atoms(): | ||
| if ((atom.chirality is not None) | ||
| and mol.has_out_ring_bond(atom.index) | ||
| and _should_invert_chirality(mol, atom)): | ||
| atom.invert_chirality() | ||
| ATOM_TYPE = 1 | ||
| BRANCH_TYPE = 2 | ||
| RING_TYPE = 3 | ||
| fragments = [] | ||
| for root in mol.get_roots(): | ||
| derived = list(_fragment_to_selfies(mol, None, root)) | ||
| fragments.append("".join(derived)) | ||
| return ".".join(fragments) | ||
| def _parse_smiles(smiles: str) -> Iterable[Tuple[str, str, int]]: | ||
| """Parses a SMILES into its symbols. | ||
| def _check_bond_constraints(mol, smiles): | ||
| errors = [] | ||
| A generator, which parses a SMILES string and returns its symbol(s) | ||
| one-by-one as a tuple of: | ||
| (1) the bond symbol connecting the current atom/ring/branch symbol | ||
| to the previous atom/ring/branch symbol (e.g. '=', '', '#') | ||
| (2) the atom/ring/branch symbol as a string (e.g. 'C', '12', '(') | ||
| (3) the type of the symbol in (2), represented as an integer that is | ||
| either ``ATOM_TYPE``, ``BRANCH_TYPE``, or ``RING_TYPE``. | ||
| As a precondition, we also assume ``smiles`` has no dots in it. | ||
| for atom in mol.get_atoms(): | ||
| bond_cap = atom.bonding_capacity | ||
| bond_count = mol.get_bond_count(atom.index) | ||
| if bond_count > bond_cap: | ||
| errors.append((atom_to_smiles(atom), bond_count, bond_cap)) | ||
| :param smiles: the SMILES to be parsed. | ||
| :return: an iterable of the symbol(s) of the SMILES along with | ||
| their types. | ||
| """ | ||
| if errors: | ||
| err_msg = "input violates the currently-set semantic constraints\n" \ | ||
| "\tSMILES: {}\n" \ | ||
| "\tErrors:\n".format(smiles) | ||
| for e in errors: | ||
| err_msg += "\t[{:} with {} bond(s) - " \ | ||
| "a max. of {} bond(s) was specified]\n".format(*e) | ||
| raise EncoderError(err_msg) | ||
| i = 0 | ||
| while 0 <= i < len(smiles): | ||
| def _should_invert_chirality(mol, atom): | ||
| out_bonds = mol.get_out_dirbonds(atom.index) | ||
| bond = '' | ||
| if smiles[i] in {'-', '/', '\\', '=', '#', ":"}: | ||
| bond = smiles[i] | ||
| i += 1 | ||
| if smiles[i].isalpha(): # organic subset elements | ||
| if smiles[i: i + 2] in ('Br', 'Cl'): # two letter elements | ||
| symbol = smiles[i: i + 2] | ||
| symbol_type = ATOM_TYPE | ||
| i += 2 | ||
| else: | ||
| symbol = smiles[i] # one letter elements (e.g. C, N, ...) | ||
| symbol_type = ATOM_TYPE | ||
| i += 1 | ||
| elif smiles[i] in ('(', ')'): # open and closed branch brackets | ||
| bond = smiles[i + 1: i + 2] | ||
| symbol = smiles[i] | ||
| symbol_type = BRANCH_TYPE | ||
| i += 1 | ||
| elif smiles[i] == '[': # atoms encased in brackets (e.g. [NH]) | ||
| r_idx = smiles.find(']', i + 1) | ||
| symbol = smiles[i: r_idx + 1] | ||
| symbol_type = ATOM_TYPE | ||
| i = r_idx + 1 | ||
| if r_idx == -1: | ||
| raise ValueError("malformed SMILES, missing ']'") | ||
| # quick chirality specification check | ||
| chiral_i = symbol.find('@') | ||
| if symbol[chiral_i + 1].isalpha() and symbol[chiral_i + 1] != 'H': | ||
| raise ValueError("chiral specification '{}' not supported" | ||
| .format(symbol)) | ||
| elif smiles[i].isdigit(): # one-digit ring number | ||
| symbol = smiles[i] | ||
| symbol_type = RING_TYPE | ||
| i += 1 | ||
| elif smiles[i] == '%': # two-digit ring number (e.g. %12) | ||
| symbol = smiles[i + 1: i + 3] | ||
| symbol_type = RING_TYPE | ||
| i += 3 | ||
| # 1. rings whose right number is bonded to this atom (e.g. ...1...X1) | ||
| # 2. rings whose left number is bonded to this atom (e.g. X1...1...) | ||
| # 3. branches and other (e.g. X(...)...) | ||
| partition = [[], [], []] | ||
| for i, bond in enumerate(out_bonds): | ||
| if not bond.ring_bond: | ||
| partition[2].append(i) | ||
| elif bond.src < bond.dst: | ||
| partition[1].append(i) | ||
| else: | ||
| raise ValueError("unrecognized symbol '{}'".format(smiles[i])) | ||
| partition[0].append(i) | ||
| partition[1].sort(key=lambda x: out_bonds[x].dst) | ||
| yield bond, symbol, symbol_type | ||
| # construct permutation | ||
| perm = partition[0] + partition[1] + partition[2] | ||
| count = 0 | ||
| for i in range(len(perm)): | ||
| for j in range(i + 1, len(perm)): | ||
| if perm[i] > perm[j]: | ||
| count += 1 | ||
| return count % 2 != 0 # if odd permutation, should invert chirality | ||
| def _translate_smiles(smiles: str) -> str: | ||
| """A helper for ``selfies.encoder``, which translates a SMILES into a | ||
| SELFIES (assuming the input SMILES contains no dots). | ||
| def _fragment_to_selfies(mol, bond_into_root, root): | ||
| derived = SinglyLinkedList() | ||
| :param smiles: the SMILES to be translated. | ||
| :return: the SELFIES translation of SMILES. | ||
| """ | ||
| bond_into_curr, curr = bond_into_root, root | ||
| while True: | ||
| curr_atom = mol.get_atom(curr) | ||
| derived.append(_atom_to_selfies(bond_into_curr, curr_atom)) | ||
| smiles_gen = _parse_smiles(smiles) | ||
| out_bonds = mol.get_out_dirbonds(curr) | ||
| for i, bond in enumerate(out_bonds): | ||
| char_set = set(smiles) | ||
| if any(c in char_set for c in ['c', 'n', 'o', 'p', 'a', 's']): | ||
| smiles_gen = kekulize_parser(smiles_gen) | ||
| if bond.ring_bond: | ||
| if bond.src < bond.dst: | ||
| continue | ||
| # a simple mutable counter to track which atom was the i-th derived atom | ||
| derive_counter = [0] | ||
| rev_bond = mol.get_dirbond(src=bond.dst, dst=bond.src) | ||
| ring_len = bond.src - bond.dst | ||
| Q_as_symbols = get_selfies_from_index(ring_len - 1) | ||
| ring_symbol = "[{}Ring{}]".format( | ||
| _ring_bonds_to_selfies(rev_bond, bond), | ||
| len(Q_as_symbols) | ||
| ) | ||
| # a dictionary to keep track of the rings to be made. If a ring with id | ||
| # X is connected to the i-th and j-th derived atoms (i < j) with bond | ||
| # symbol s, then after the i-th atom is derived, rings[X] = (s, i). | ||
| # As soon as the j-th atom is derived, rings[X] is removed from <rings>, | ||
| # and the ring is made. | ||
| rings = {} | ||
| derived.append(ring_symbol) | ||
| for symbol in Q_as_symbols: | ||
| derived.append(symbol) | ||
| selfies, _ = _translate_smiles_derive(smiles_gen, rings, derive_counter) | ||
| elif i == len(out_bonds) - 1: | ||
| bond_into_curr, curr = bond, bond.dst | ||
| if rings: | ||
| raise ValueError("malformed ring numbering or ring numbering " | ||
| "across a dot symbol") | ||
| return selfies | ||
| def _translate_smiles_derive(smiles_gen: Iterable[Tuple[str, str, int]], | ||
| rings: Dict[int, Tuple[str, int]], | ||
| counter: List[int]) -> Tuple[str, int]: | ||
| """Recursive helper for _translate_smiles. | ||
| Derives the SELFIES from a SMILES, and returns a tuple of (1) the | ||
| translated SELFIES and (2) the symbol length of the translated SELFIES. | ||
| :param smiles_gen: an iterable of the symbols (and their types) | ||
| of the SMILES to be translated, created by ``_parse_smiles``. | ||
| :param rings: See ``rings`` in ``_translate_smiles``. | ||
| :param counter: a one-element list that serves as a mutable counter. | ||
| See ``derived_counter`` in ``_translate_smiles``. | ||
| :return: A tuple of the translated SELFIES and its symbol length. | ||
| """ | ||
| selfies = "" | ||
| selfies_len = 0 | ||
| prev_idx = -1 | ||
| for bond, symbol, symbol_type in smiles_gen: | ||
| if bond == '-': # ignore explicit single bonds | ||
| bond = '' | ||
| if symbol_type == ATOM_TYPE: | ||
| if symbol[0] == '[': | ||
| selfies += "[{}{}expl]".format(bond, symbol[1:-1]) | ||
| else: | ||
| selfies += "[{}{}]".format(bond, symbol) | ||
| prev_idx = counter[0] | ||
| counter[0] += 1 | ||
| selfies_len += 1 | ||
| branch = _fragment_to_selfies(mol, bond, bond.dst) | ||
| Q_as_symbols = get_selfies_from_index(len(branch) - 1) | ||
| branch_symbol = "[{}Branch{}]".format( | ||
| _bond_to_selfies(bond, show_stereo=False), | ||
| len(Q_as_symbols) | ||
| ) | ||
| elif symbol_type == BRANCH_TYPE: | ||
| if symbol == '(': | ||
| derived.append(branch_symbol) | ||
| for symbol in Q_as_symbols: | ||
| derived.append(symbol) | ||
| derived.extend(branch) | ||
| # NOTE: looping inside a loop on a generator will produce | ||
| # expected behaviour in this case. | ||
| # end of chain | ||
| if (not out_bonds) or out_bonds[-1].ring_bond: | ||
| break | ||
| branch, branch_len = \ | ||
| _translate_smiles_derive(smiles_gen, rings, counter) | ||
| return derived | ||
| N_as_symbols = get_symbols_from_n(branch_len - 1) | ||
| bond_num = get_num_from_bond(bond) | ||
| selfies += "[Branch{}_{}]".format(len(N_as_symbols), bond_num) | ||
| selfies += ''.join(N_as_symbols) + branch | ||
| selfies_len += 1 + len(N_as_symbols) + branch_len | ||
| def _bond_to_selfies(bond, show_stereo=True): | ||
| if not show_stereo and (bond.order == 1): | ||
| return "" | ||
| return bond_to_smiles(bond) | ||
| else: # symbol == ')' | ||
| break | ||
| else: # symbol_type == RING_TYPE | ||
| ring_id = int(symbol) | ||
| def _ring_bonds_to_selfies(lbond, rbond): | ||
| assert lbond.order == rbond.order | ||
| if ring_id in rings: | ||
| left_bond, left_end = rings.pop(ring_id) | ||
| right_bond, right_end = bond, prev_idx | ||
| if (lbond.order != 1) or all(b.stereo is None for b in (lbond, rbond)): | ||
| return _bond_to_selfies(lbond, show_stereo=False) | ||
| else: | ||
| bond_char = "-" if (lbond.stereo is None) else lbond.stereo | ||
| bond_char += "-" if (rbond.stereo is None) else rbond.stereo | ||
| return bond_char | ||
| ring_len = right_end - left_end | ||
| N_as_symbols = get_symbols_from_n(ring_len - 1) | ||
| if left_bond != '': | ||
| selfies += "[Expl{}Ring{}]".format(left_bond, | ||
| len(N_as_symbols)) | ||
| elif right_bond != '': | ||
| selfies += "[Expl{}Ring{}]".format(right_bond, | ||
| len(N_as_symbols)) | ||
| else: | ||
| selfies += "[Ring{}]".format(len(N_as_symbols)) | ||
| selfies += ''.join(N_as_symbols) | ||
| selfies_len += 1 + len(N_as_symbols) | ||
| else: | ||
| rings[ring_id] = (bond, prev_idx) | ||
| return selfies, selfies_len | ||
def _atom_to_selfies(bond, atom):
    """Build the bracketed SELFIES symbol for ``atom``.

    :param bond: the bond leading into ``atom``, or ``None`` when there is
        no incoming bond (no bond prefix is emitted in that case).
    :param atom: the atom to render; it must not be aromatic.
    :return: ``"[<bond prefix><atom>]"``, where the atom text comes from
        ``atom_to_smiles(atom, brackets=False)``.
    """
    assert not atom.is_aromatic

    if bond is None:
        prefix = ""
    else:
        prefix = _bond_to_selfies(bond)
    atom_text = atom_to_smiles(atom, brackets=False)
    return "[{}{}]".format(prefix, atom_text)
+157
-377
@@ -1,428 +0,208 @@ | ||
| from itertools import product | ||
| from typing import Dict, List, Optional, Set, Tuple | ||
| import functools | ||
| import itertools | ||
| import re | ||
| from typing import Any, List, Optional, Tuple | ||
| default_bond_constraints = { | ||
| 'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1, | ||
| 'O': 2, 'O+1': 3, 'O-1': 1, | ||
| 'N': 3, 'N+1': 4, 'N-1': 2, | ||
| 'C': 4, 'C+1': 5, 'C-1': 3, | ||
| 'P': 5, 'P+1': 6, 'P-1': 4, | ||
| 'S': 6, 'S+1': 7, 'S-1': 5, | ||
| '?': 8 | ||
| } | ||
| octet_rule_bond_constraints = dict(default_bond_constraints) | ||
| octet_rule_bond_constraints.update( | ||
| {'S': 2, 'S+1': 3, 'S-1': 1, 'P': 3, 'P+1': 4, 'P-1': 2} | ||
| from selfies.constants import ( | ||
| ELEMENTS, | ||
| INDEX_ALPHABET, | ||
| INDEX_CODE, | ||
| ORGANIC_SUBSET | ||
| ) | ||
| from selfies.mol_graph import Atom | ||
| from selfies.utils.smiles_utils import smiles_to_bond | ||
| hypervalent_bond_constraints = dict(default_bond_constraints) | ||
| hypervalent_bond_constraints.update( | ||
| {'Cl': 7, 'Br': 7, 'I': 7, 'N': 5} | ||
| ) | ||
| _bond_constraints = default_bond_constraints | ||
def process_atom_symbol(symbol: str) -> Optional[Tuple[Any, Atom]]:
    """Resolve a SELFIES atom symbol into its bond info and a new atom.

    The parsed form of ``symbol`` is looked up in ``_PROCESS_ATOM_CACHE``;
    on a miss it is parsed with ``_process_atom_selfies_no_cache`` and,
    if parsing succeeded, stored in the cache for later calls.

    :param symbol: the SELFIES symbol to process.
    :return: a tuple of (1) the bond info and (2) an atom created by the
        cached atom factory, or ``None`` if the symbol could not be parsed
        or the resulting atom has a negative bonding capacity.
    """
    if symbol in _PROCESS_ATOM_CACHE:
        entry = _PROCESS_ATOM_CACHE[symbol]
    else:
        entry = _process_atom_selfies_no_cache(symbol)
        if entry is None:
            return None
        _PROCESS_ATOM_CACHE[symbol] = entry

    bond_info, make_atom = entry
    atom = make_atom()  # the cache holds a factory; build a new atom per call
    if atom.bonding_capacity < 0:
        return None  # too many Hs (e.g. [CH9])
    return bond_info, atom
| def get_semantic_robust_alphabet() -> Set[str]: | ||
| """Returns a subset of all symbols that are semantically constrained | ||
| by :mod:`selfies`. | ||
| These semantic constraints can be configured with | ||
| :func:`selfies.set_semantic_constraints`. | ||
def process_branch_symbol(symbol: str) -> Optional[Tuple[int, int]]:
    """Look up a branch symbol in ``_PROCESS_BRANCH_CACHE``.

    :param symbol: the SELFIES symbol to look up.
    :return: the cached entry for ``symbol``, or ``None`` if ``symbol``
        is not a key of the cache.
    """
    return _PROCESS_BRANCH_CACHE.get(symbol)
| :return: a subset of all symbols that are semantically constrained. | ||
| """ | ||
| alphabet_subset = set() | ||
def process_ring_symbol(symbol: str) -> Optional[Tuple[int, int, Any]]:
    """Look up a ring symbol in ``_PROCESS_RING_CACHE``.

    :param symbol: the SELFIES symbol to look up.
    :return: the cached entry for ``symbol``, or ``None`` if ``symbol``
        is not a key of the cache.
    """
    return _PROCESS_RING_CACHE.get(symbol)
| organic_subset = {'B', 'C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I'} | ||
| bonds = {'': 1, '=': 2, '#': 3} | ||
| # add atomic symbols | ||
| for (a, c), (b, m) in product(_bond_constraints.items(), bonds.items()): | ||
| if (m > c) or (a == '?'): | ||
| continue | ||
| if a in organic_subset: | ||
| symbol = "[{}{}]".format(b, a) | ||
| else: | ||
| symbol = "[{}{}expl]".format(b, a) | ||
| alphabet_subset.add(symbol) | ||
| # add branch and ring symbols | ||
| for i in range(1, 4): | ||
| alphabet_subset.add("[Ring{}]".format(i)) | ||
| alphabet_subset.add("[Expl=Ring{}]".format(i)) | ||
| for j in range(1, 4): | ||
| alphabet_subset.add("[Branch{}_{}]".format(i, j)) | ||
| return alphabet_subset | ||
def get_default_constraints() -> Dict[str, int]:
    """Returns a copy of the preset "default" bond constraint settings.

    :return: a new dictionary holding the default constraint settings.
    """
    return default_bond_constraints.copy()
def get_octet_rule_constraints() -> Dict[str, int]:
    """Returns a copy of the preset "octet rule" bond constraint settings.

    These constraints are stricter than the default ones, so that the
    `octet rule <https://en.wikipedia.org/wiki/Octet_rule>`_ is obeyed:
    ``S`` is limited to 2 bonds and ``P`` to 3 bonds (with the
    ``S+``, ``S-``, ``P+``, ``P-`` ions restricted correspondingly).

    :return: a new dictionary holding the octet rule constraint settings.
    """
    return octet_rule_bond_constraints.copy()
def get_hypervalent_constraints() -> Dict[str, int]:
    """Returns a copy of the preset "hypervalent" bond constraint settings.

    These constraints loosen the default ones to allow for
    `hypervalent molecules
    <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_:
    ``Cl``, ``Br`` and ``I`` may form up to 7 bonds, and ``N`` up to 5.

    :return: a new dictionary holding the hypervalent constraint settings.
    """
    return hypervalent_bond_constraints.copy()
def get_semantic_constraints() -> Dict[str, int]:
    """Returns a copy of the semantic bond constraints currently in use
    by :mod:`selfies`.

    The active constraints are those passed to the most recent call of
    :func:`selfies.set_semantic_constraints`, or the default bond
    constraints if that function has not been called yet. A copy is
    returned, so mutating the result does not affect :mod:`selfies`.
    See :func:`selfies.set_semantic_constraints` for further explanation.

    :return: the bond constraints :mod:`selfies` is currently operating on.
    """
    return _bond_constraints.copy()
def set_semantic_constraints(
        bond_constraints: Optional[Dict[str, int]] = None) -> None:
    """Configures the semantic constraints of :mod:`selfies`.

    The SELFIES grammar is enforced dynamically from the dictionary
    ``bond_constraints``. Its keys are atoms and/or ions (e.g. ``I``,
    ``Fe+2``); an ion is written as ``E+C`` or ``E-C``, where ``E`` is an
    element and ``C`` is a positive integer. Each value is the maximum
    number of bonds that atom or ion can make, between 1 and 8 inclusive.
    For example:

        * ``bond_constraints['I'] = 1``
        * ``bond_constraints['C'] = 4``

    :func:`selfies.decoder` only generates SMILES that respect these
    constraints. With the example above, ``'[C][=I]'`` and ``'[I][=C]'``
    are translated to ``'CI'`` and ``'IC'`` respectively, since ``I`` is
    limited to a single bond.

    Atoms or ions missing from ``bond_constraints`` fall back to the limit
    stored under the mandatory key ``'?'`` (8 in the default settings;
    must also be between 1 and 8 inclusive).

    :param bond_constraints: a dictionary representing the semantic
        constraints :mod:`selfies` will operate upon. Defaults to
        ``None``; in this case, the default dictionary is used.
    :raises ValueError: if ``'?'`` is not a key of ``bond_constraints``,
        or if any value is not between 1 and 8 inclusive.
    :return: ``None``.
    """
    global _bond_constraints

    if bond_constraints is None:
        _bond_constraints = default_bond_constraints
        return

    # validate the whole dictionary before replacing the active settings
    if '?' not in bond_constraints:
        raise ValueError("bond_constraints missing '?' as a key.")

    for atom_or_ion, max_bonds in bond_constraints.items():
        if not (1 <= max_bonds <= 8):
            raise ValueError("bond_constraints['{}'] not between "
                             "1 and 8 inclusive.".format(atom_or_ion))

    # store a copy so later changes by the caller do not leak in
    _bond_constraints = dict(bond_constraints)
| # Symbol State Dict Functions ============================================== | ||
| def get_next_state(symbol: str, state: int) -> Tuple[str, int]: | ||
| """Enforces the grammar rules for standard SELFIES symbols. | ||
| Given the current non-branch, non-ring symbol and current derivation | ||
| state, retrieves the derived SMILES symbol and the next derivation | ||
| state. | ||
| :param symbol: a SELFIES symbol that is not a Ring or Branch. | ||
| :param state: the current derivation state. | ||
| :return: a tuple of (1) the derived symbol, and | ||
| (2) the next derivation state. | ||
| """ | ||
| if symbol == '[epsilon]': | ||
| return ('', 0) if state == 0 else ('', -1) | ||
| # convert to smiles symbol | ||
| bond = '' | ||
| if symbol[1] in {'/', '\\', '=', '#'}: | ||
| bond = symbol[1] | ||
| bond_num = get_num_from_bond(bond) | ||
| if symbol[-5:] == 'expl]': # e.g. [C@@Hexpl] | ||
| smiles_symbol = "[{}]".format(symbol[1 + len(bond):-5]) | ||
| else: | ||
| smiles_symbol = symbol[1 + len(bond):-1] | ||
| # get bond capacity | ||
| element, h_count, charge = parse_atom_symbol(smiles_symbol) | ||
| if charge == 0: | ||
| atom_or_ion = element | ||
| else: | ||
| atom_or_ion = "{}{:+}".format(element, charge) | ||
| max_bonds = _bond_constraints.get(atom_or_ion, | ||
| _bond_constraints['?']) | ||
| if (h_count > max_bonds) or (h_count == max_bonds and state > 0): | ||
| raise ValueError("too many Hs in symbol '{}'; consider " | ||
| "adjusting bond constraints".format(symbol)) | ||
| max_bonds -= h_count # hydrogens consume 1 bond | ||
| # calculate next state | ||
| def next_atom_state( | ||
| bond_order: int, bond_cap: int, state: int | ||
| ) -> Tuple[int, Optional[int]]: | ||
| if state == 0: | ||
| bond = '' | ||
| next_state = max_bonds | ||
| else: | ||
| if bond_num > min(state, max_bonds): | ||
| bond_num = min(state, max_bonds) | ||
| bond = get_bond_from_num(bond_num) | ||
| bond_order = 0 | ||
| next_state = max_bonds - bond_num | ||
| if next_state == 0: | ||
| next_state = -1 | ||
| bond_order = min(bond_order, state, bond_cap) | ||
| bonds_left = bond_cap - bond_order | ||
| next_state = None if (bonds_left == 0) else bonds_left | ||
| return bond_order, next_state | ||
| return (bond + smiles_symbol), next_state | ||
def next_branch_state(
        branch_type: int, state: int
) -> Tuple[int, Optional[int]]:
    """Compute the derivation states produced by a branch symbol.

    :param branch_type: the branch type, which must be 1, 2 or 3.
    :param state: the current derivation state, which must exceed 1.
    :return: a tuple of (1) the state the new branch starts in, namely
        ``min(state - 1, branch_type)``, and (2) the state that remains
        after the branch, namely ``state`` minus the first value.
    """
    assert 1 <= branch_type <= 3
    assert state > 1

    init_state = min(state - 1, branch_type)
    return init_state, state - init_state


# Branch State Dict Functions =================================================
| def get_next_branch_state(branch_symbol: str, state: int) -> Tuple[int, int]: | ||
| """Enforces the grammar rules for SELFIES Branch symbols. | ||
| def next_ring_state( | ||
| ring_type: int, state: int | ||
| ) -> Tuple[int, Optional[int]]: | ||
| assert state > 0 | ||
| Given the branch symbol and current derivation state, retrieves | ||
| the initial branch derivation state (i.e. the derivation state that the | ||
| new branch begins on), and the next derivation state (i.e. the derivation | ||
| state after the branch is created). | ||
| bond_order = min(ring_type, state) | ||
| bonds_left = state - bond_order | ||
| next_state = None if (bonds_left == 0) else bonds_left | ||
| return bond_order, next_state | ||
| :param branch_symbol: the branch symbol (e.g. [Branch1_2], [Branch3_1]) | ||
| :param state: the current derivation state. | ||
| :return: a tuple of (1) the initial branch state, and | ||
| (2) the next derivation state. | ||
| """ | ||
| branch_type = int(branch_symbol[-2]) # branches of the form [BranchL_X] | ||
| if not (1 <= branch_type <= 3): | ||
| raise ValueError("unknown branch symbol '{}'".format(branch_symbol)) | ||
| if 2 <= state <= 8: | ||
| branch_init_state = min(state - 1, branch_type) | ||
| next_state = state - branch_init_state | ||
| return branch_init_state, next_state | ||
| else: | ||
| return -1, state | ||
| # SELFIES Symbol to N Functions ============================================ | ||
| _index_alphabet = ['[C]', '[Ring1]', '[Ring2]', | ||
| '[Branch1_1]', '[Branch1_2]', '[Branch1_3]', | ||
| '[Branch2_1]', '[Branch2_2]', '[Branch2_3]', | ||
| '[O]', '[N]', '[=N]', '[=C]', '[#C]', '[S]', '[P]'] | ||
| # _alphabet_code takes as a key a SELFIES symbol, and its corresponding value | ||
| # is the index of the key. | ||
| _alphabet_code = {c: i for i, c in enumerate(_index_alphabet)} | ||
| def get_n_from_symbols(*symbols: List[str]) -> int: | ||
| """Computes N from a list of SELFIES symbols. | ||
| Converts a list of SELFIES symbols [c_1, ..., c_n] into a number N. | ||
| This is done by converting each symbol c_n to an integer idx(c_n) via | ||
| ``_alphabet_code``, and then treating the list as a number in base | ||
| len(_alphabet_code). If a symbol is unrecognized, it is given value 0 by | ||
| default. | ||
| :param symbols: a list of SELFIES symbols. | ||
| :return: the corresponding N for ``symbols``. | ||
| """ | ||
| N = 0 | ||
| def get_index_from_selfies(*symbols: List[str]) -> int: | ||
| index = 0 | ||
| for i, c in enumerate(reversed(symbols)): | ||
| N_i = _alphabet_code.get(c, 0) * (len(_alphabet_code) ** i) | ||
| N += N_i | ||
| return N | ||
| index += INDEX_CODE.get(c, 0) * (len(INDEX_CODE) ** i) | ||
| return index | ||
| def get_symbols_from_n(n: int) -> List[str]: | ||
| """Converts an integer n into a list of SELFIES symbols that, if | ||
| passed into ``get_n_from_symbols`` in that order, would have produced n. | ||
| def get_selfies_from_index(index: int) -> List[str]: | ||
| if index < 0: | ||
| raise IndexError() | ||
| elif index == 0: | ||
| return [INDEX_ALPHABET[0]] | ||
| :param n: an integer from 0 to 4095 inclusive. | ||
| :return: a list of SELFIES symbols representing n in base | ||
| ``len(_alphabet_code)``. | ||
| """ | ||
| if n == 0: | ||
| return [_index_alphabet[0]] | ||
| symbols = [] | ||
| base = len(_index_alphabet) | ||
| while n: | ||
| symbols.append(_index_alphabet[n % base]) | ||
| n //= base | ||
| base = len(INDEX_ALPHABET) | ||
| while index: | ||
| symbols.append(INDEX_ALPHABET[index % base]) | ||
| index //= base | ||
| return symbols[::-1] | ||
| # Helper Functions ============================================================ | ||
| # ============================================================================= | ||
| # Caches (for computational speed) | ||
| # ============================================================================= | ||
| def get_num_from_bond(bond_symbol: str) -> int: | ||
| """Retrieves the bond multiplicity from a SMILES symbol representing | ||
| a bond. If ``bond_symbol`` is not known, 1 is returned by default. | ||
| SELFIES_ATOM_PATTERN = re.compile( | ||
| r"^[\[]" # opening square bracket [ | ||
| r"([=#/\\]?)" # bond char | ||
| r"(\d*)" # isotope number (optional, e.g. 123, 26) | ||
| r"([A-Z][a-z]?)" # element symbol | ||
| r"([@]{0,2})" # chiral_tag (optional, only @ and @@ supported) | ||
| r"((?:[H]\d)?)" # H count (optional, e.g. H1, H3) | ||
| r"((?:[+-][1-9]+)?)" # charge (optional, e.g. +1) | ||
| r"[]]$" # closing square bracket ] | ||
| ) | ||
| :param bond_symbol: a SMILES symbol representing a bond. | ||
| :return: the bond multiplicity of ``bond_symbol``, or 1 if | ||
| ``bond_symbol`` is not recognized. | ||
| """ | ||
| if bond_symbol == "=": | ||
| return 2 | ||
| elif bond_symbol == "#": | ||
| return 3 | ||
| else: | ||
| return 1 | ||
| def _process_atom_selfies_no_cache(symbol): | ||
| m = SELFIES_ATOM_PATTERN.match(symbol) | ||
| if m is None: | ||
| return None | ||
| bond_char, isotope, element, chirality, h_count, charge = m.groups() | ||
| if symbol[1 + len(bond_char):-1] in ORGANIC_SUBSET: | ||
| atom_fac = functools.partial(Atom, element=element, is_aromatic=False) | ||
| return smiles_to_bond(bond_char), atom_fac | ||
| def get_bond_from_num(n: int) -> str: | ||
| """Returns the SMILES symbol representing a bond with multiplicity | ||
| ``n``. More specifically, ``'' = 1`` and ``'=' = 2`` and ``'#' = 3``. | ||
| isotope = None if (isotope == "") else int(isotope) | ||
| if element not in ELEMENTS: | ||
| return None | ||
| chirality = None if (chirality == "") else chirality | ||
| :param n: either 1, 2, 3. | ||
| :return: the SMILES symbol representing a bond with multiplicity ``n``. | ||
| """ | ||
| s = h_count | ||
| if s == "": | ||
| h_count = 0 | ||
| else: | ||
| h_count = int(s[1:]) | ||
| return ('', '=', '#')[n - 1] | ||
| s = charge | ||
| if s == "": | ||
| charge = 0 | ||
| else: | ||
| charge = int(s[1:]) | ||
| charge *= 1 if (s[0] == "+") else -1 | ||
| atom_fac = functools.partial( | ||
| Atom, | ||
| element=element, | ||
| is_aromatic=False, | ||
| isotope=isotope, | ||
| chirality=chirality, | ||
| h_count=h_count, | ||
| charge=charge | ||
| ) | ||
| def find_element(atom_symbol: str) -> Tuple[int, int]: | ||
| """Returns the indices of the element component of a SMILES atom symbol. | ||
| return smiles_to_bond(bond_char), atom_fac | ||
| That is, if atom_symbol[i:j] is the element substring of the SMILES atom, | ||
| then (i, j) is returned. For example: | ||
| * _find_element('b') = (0, 1). | ||
| * _find_element('B') = (0, 1). | ||
| * _find_element('[13C]') = (3, 4). | ||
| * _find_element('[nH+]') = (1, 2). | ||
| :param atom_symbol: a SMILES atom. | ||
| :return: a tuple of the indices of the element substring of | ||
| ``atom_symbol``. | ||
| """ | ||
| def _build_atom_cache(): | ||
| cache = dict() | ||
| common_symbols = [ | ||
| "[#C+1]", "[#C-1]", "[#C]", "[#N+1]", "[#N]", "[#O+1]", "[#P+1]", | ||
| "[#P-1]", "[#P]", "[#S+1]", "[#S-1]", "[#S]", "[=C+1]", "[=C-1]", | ||
| "[=C]", "[=N+1]", "[=N-1]", "[=N]", "[=O+1]", "[=O]", "[=P+1]", | ||
| "[=P-1]", "[=P]", "[=S+1]", "[=S-1]", "[=S]", "[Br]", "[C+1]", "[C-1]", | ||
| "[C]", "[Cl]", "[F]", "[H]", "[I]", "[N+1]", "[N-1]", "[N]", "[O+1]", | ||
| "[O-1]", "[O]", "[P+1]", "[P-1]", "[P]", "[S+1]", "[S-1]", "[S]" | ||
| ] | ||
| if atom_symbol[0] != '[': | ||
| return 0, len(atom_symbol) | ||
| for symbol in common_symbols: | ||
| cache[symbol] = _process_atom_selfies_no_cache(symbol) | ||
| return cache | ||
| i = 1 | ||
| while atom_symbol[i].isdigit(): # skip isotope number | ||
| i += 1 | ||
| if atom_symbol[i + 1].isalpha() and atom_symbol[i + 1] != 'H': | ||
| return i, i + 2 | ||
| else: | ||
| return i, i + 1 | ||
def _build_branch_cache():
    """Precompute the lookup table for every supported branch symbol.

    For each ``L`` in 1..3 and each bond character in ``""``, ``"="``,
    ``"#"``, the symbol ``"[{bond}Branch{L}]"`` is mapped to a tuple of
    (1) the first element returned by ``smiles_to_bond`` for that bond
    character and (2) ``L``.

    :return: a dictionary from branch symbol to its tuple.
    """
    return {
        "[{}Branch{}]".format(bond_char, length):
            (smiles_to_bond(bond_char)[0], length)
        for length in range(1, 4)
        for bond_char in ("", "=", "#")
    }
| def parse_atom_symbol(atom_symbol: str) -> Tuple[str, int, int]: | ||
| """Parses a SMILES atom symbol and returns its element component, | ||
| number of associated hydrogens, and charge. | ||
| def _build_ring_cache(): | ||
| cache = dict() | ||
| for L in range(1, 4): | ||
| # [RingL], [=RingL], [#RingL] | ||
| for bond_char in ["", "=", "#"]: | ||
| symbol = "[{}Ring{}]".format(bond_char, L) | ||
| order, stereo = smiles_to_bond(bond_char) | ||
| cache[symbol] = (order, L, (stereo, stereo)) | ||
| See http://opensmiles.org/opensmiles.html for the formal grammar | ||
| of SMILES atom symbols. Note that only @ and @@ are currently supported | ||
| as chiral specifications. | ||
| # [-/RingL], [\/RingL], [\-RingL], ... | ||
| for lchar, rchar in itertools.product(["-", "/", "\\"], repeat=2): | ||
| if lchar == rchar == "-": | ||
| continue | ||
| symbol = "[{}{}Ring{}]".format(lchar, rchar, L) | ||
| order, lstereo = smiles_to_bond(lchar) | ||
| order, rstereo = smiles_to_bond(rchar) | ||
| cache[symbol] = (order, L, (lstereo, rstereo)) | ||
| return cache | ||
| :param atom_symbol: a SMILES atom symbol. | ||
| :return: a tuple of (1) the element of ``atom_symbol``, (2) the hydrogen | ||
| count, and (3) the charge. | ||
| """ | ||
| if atom_symbol[0] != '[': | ||
| return atom_symbol, 0, 0 | ||
| _PROCESS_ATOM_CACHE = _build_atom_cache() | ||
| atom_start, atom_end = find_element(atom_symbol) | ||
| i = atom_end | ||
| _PROCESS_BRANCH_CACHE = _build_branch_cache() | ||
| # skip chirality | ||
| if atom_symbol[i] == '@': # e.g. @ | ||
| i += 1 | ||
| if atom_symbol[i] == '@': # e.g. @@ | ||
| i += 1 | ||
| h_count = 0 # hydrogen count | ||
| if atom_symbol[i] == 'H': | ||
| h_count = 1 | ||
| i += 1 | ||
| if atom_symbol[i].isdigit(): # e.g. [CH2] | ||
| h_count = int(atom_symbol[i]) | ||
| i += 1 | ||
| charge = 0 # charge count | ||
| if atom_symbol[i] in ('+', '-'): | ||
| charge = 1 if atom_symbol[i] == '+' else -1 | ||
| i += 1 | ||
| if atom_symbol[i] in ('+', '-'): # e.g. [Cu++] | ||
| while atom_symbol[i] in ('+', '-'): | ||
| charge += (1 if atom_symbol[i] == '+' else -1) | ||
| i += 1 | ||
| elif atom_symbol[i].isdigit(): # e.g. [Cu+2] | ||
| s = i | ||
| while atom_symbol[i].isdigit(): | ||
| i += 1 | ||
| charge *= int(atom_symbol[s:i]) | ||
| return atom_symbol[atom_start: atom_end], h_count, charge | ||
| _PROCESS_RING_CACHE = _build_ring_cache() |
+2
-2
@@ -10,4 +10,4 @@ #!/usr/bin/env python | ||
| name="selfies", | ||
| version="1.0.4", | ||
| author="Mario Krenn", | ||
| version="2.0.0", | ||
| author="Mario Krenn, Alston Lo, and many other contributors", | ||
| author_email="mario.krenn@utoronto.ca, alan@aspuru.com", | ||
@@ -14,0 +14,0 @@ description="SELFIES (SELF-referencIng Embedded Strings) is a " |
| from typing import Dict, Iterable, List, Set, Tuple, Union | ||
| from selfies.grammar_rules import find_element, get_num_from_bond, \ | ||
| parse_atom_symbol | ||
| ATOM_TYPE = 1 | ||
| BRANCH_TYPE = 2 | ||
| RING_TYPE = 3 | ||
def kekulize_parser(
        smiles_gen: Iterable[Tuple[str, str, int]]
) -> Iterable[Tuple[str, str, int]]:
    """Kekulizes a SMILES that is given in the form of an iterable.

    This generator sits between ``encoder._parse_smiles`` and its consumer
    and acts as a filter that kekulizes the SMILES. Working on the already
    parsed symbols keeps string parsing and concatenation to a minimum.

    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
               -treatment-of-aromaticity-in-the-smiles-language/

    :param smiles_gen: an iterator returned by ``encoder._parse_smiles``.
    :return: an iterator representing the kekulized SMILES, in the same
        format as that returned by ``encoder._parse_smiles``.
    """
    # Materialize the iterator so several functions can walk it, and turn
    # each tuple into a list so entries can be modified in place.
    symbols = [list(entry) for entry in smiles_gen]

    graph = MolecularGraph(symbols)
    _build_molecular_graph(graph, symbols, {})

    # only run the kekulization when the graph recorded aromatic atoms
    if graph.aro_indices:
        _kekulize(graph)

    # hand the (possibly rewritten) symbols back as tuples
    yield from map(tuple, graph.smiles_symbols)
def _build_molecular_graph(graph,
                           smiles_symbols: List[List[Union[str, int]]],
                           rings: Dict[int, Tuple[int, int]],
                           prev_idx: int = -1,
                           curr_idx: int = -1) -> int:
    """Populates ``graph`` with the bonds described by ``smiles_symbols``.

    The symbols are scanned from ``curr_idx + 1`` onwards. Each atom is
    bonded to the previously seen atom, an opening branch is handled by a
    recursive call, and a closing branch ends the current call. Note that
    ``smiles_symbols`` is mutated: when a ring is closed, the bond entry
    of one of its two ring symbols is blanked.

    :param graph: the MolecularGraph to be added to.
    :param smiles_symbols: a list created from the iterator returned
        by ``encoder._parse_smiles``.
    :param rings: an, initially, empty dictionary used to keep track of
        rings to be made. It maps a ring symbol to a tuple of (1) the index
        of the atom the ring was opened on and (2) the index of the
        opening ring symbol.
    :param prev_idx: the index of the most recently seen atom, or a
        negative value if no atom has been seen yet.
    :param curr_idx: the index of the last symbol that was already
        processed; scanning resumes right after it.
    :return: the last index of ``smiles_symbols`` that was processed.
    """
    n_symbols = len(smiles_symbols)

    while curr_idx + 1 < n_symbols:
        curr_idx += 1
        _bond, symbol, symbol_type = smiles_symbols[curr_idx]

        if symbol_type == ATOM_TYPE:
            # the bond to the previous atom is stored on this atom's entry
            if prev_idx >= 0:
                graph.add_bond(prev_idx, curr_idx, curr_idx)
            prev_idx = curr_idx
            continue

        if symbol_type == BRANCH_TYPE:
            if symbol != '(':
                break  # closing bracket: this (sub-)chain is finished
            curr_idx = _build_molecular_graph(graph, smiles_symbols, rings,
                                              prev_idx, curr_idx)
            continue

        # remaining case: a ring symbol
        if symbol not in rings:
            # ring opening: remember its atom and where its bond is stored
            rings[symbol] = (prev_idx, curr_idx)
            continue

        # ring closure
        left_idx, left_bond_idx = rings.pop(symbol)

        # we mutate one bond index to be '', so that we
        # can faithfully represent the bond to be localized at
        # one index. For example, C=1CCCC=1 --> C1CCCC=1.
        if smiles_symbols[left_bond_idx][0] != '':
            bond_idx, blanked_idx = left_bond_idx, curr_idx
        else:
            bond_idx, blanked_idx = curr_idx, left_bond_idx
        smiles_symbols[blanked_idx][0] = ''

        graph.add_bond(left_idx, prev_idx, bond_idx)

    return curr_idx
| def _kekulize(mol_graph) -> None: | ||
| """Kekulizes the molecular graph. | ||
| :param mol_graph: a molecular graph to be kekulized. | ||
| :return: None. | ||
| """ | ||
| mol_graph.prune_to_pi_subgraph() | ||
| visited = set() | ||
| for i in mol_graph.get_nodes_by_num_edges(): | ||
| success = mol_graph.dfs_assign_bonds(i, visited, set(), set()) | ||
| if not success: | ||
| raise ValueError("kekulization algorithm failed") | ||
| mol_graph.write_to_smiles_symbols() | ||
| # Aromatic Helper Methods and Classes | ||
| # key = aromatic SMILES element, value = number of valence electrons | ||
| # Note: wild card '*' not supported currently | ||
| _aromatic_valences = { | ||
| 'b': 3, 'al': 3, 'c': 4, 'si': 4, 'n': 5, 'p': 5, | ||
| 'as': 5, 'o': 6, 's': 6, 'se': 6, 'te': 6 | ||
| } | ||
def _capitalize(atom_symbol: str) -> str:
    """Upper-cases the first letter of the element inside an aromatic
    SMILES atom symbol, turning it into a standard SMILES atom symbol.

    :param atom_symbol: an aromatic SMILES atom symbol.
    :return: ``atom_symbol`` with the first character of its element
        (as located by ``find_element``) capitalized.
    """
    start, _ = find_element(atom_symbol)
    before = atom_symbol[:start]
    first_letter = atom_symbol[start]
    after = atom_symbol[start + 1:]
    return "".join((before, first_letter.upper(), after))
def _is_aromatic(atom_symbol: str) -> bool:
    """Tells whether a SMILES atom symbol denotes an aromatic atom.

    An atom symbol is aromatic when its element substring does not start
    with an upper-case letter.

    :param atom_symbol: a SMILES atom symbol.
    :raises ValueError: if the element is not capitalized but is also not
        a key of ``_aromatic_valences``.
    :return: True, if ``atom_symbol`` is an aromatic atom symbol,
        and False otherwise.
    """
    start, end = find_element(atom_symbol)

    # skip the slice (and the string copy) when the element ends the symbol
    element = atom_symbol if end == len(atom_symbol) \
        else atom_symbol[start: end]

    if element[0].isupper():  # capitalized element -> not aromatic
        return False

    if element in _aromatic_valences:
        return True

    raise ValueError("unrecognized aromatic symbol '{}'"
                     .format(atom_symbol))
def _in_pi_subgraph(atom_symbol: str, bonds: Tuple[str]) -> bool:
    """Decides, from its bonds, whether a SMILES atom symbol should be a
    node of the pi subgraph.

    An atom belongs to the pi subgraph if it is left with an unpaired
    valence electron, and is therefore able to make a double bond.

    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
               -treatment-of-aromaticity-in-the-smiles-language/

    :param atom_symbol: a SMILES atom symbol representing an atom.
    :param bonds: the bonds connected to ``atom_symbol``.
    :raises ValueError: if the atom ends up with more than one hydrogen.
    :return: True if ``atom_symbol`` should be included in the pi subgraph,
        and False otherwise.
    """
    atom, h_count, charge = parse_atom_symbol(atom_symbol)

    # every bond consumes as many electrons as its multiplicity
    used_electrons = sum(get_num_from_bond(b) for b in bonds)

    # e.g. c1ccccc1
    # this also covers the neutral carbon radical case (e.g. C1=[C]NC=C1),
    # which is treated equivalently to a 1-H carbon (e.g. C1=[CH]NC=C1)
    has_implied_h = (
        (atom == 'c')
        and (h_count == charge == 0)
        and (len(bonds) == 2)
        and ('#' not in bonds)
    )
    if has_implied_h:
        h_count += 1  # implied bonded hydrogen

    if h_count > 1:
        raise ValueError("unrecognized aromatic symbol '{}'"
                         .format(atom_symbol))
    if h_count == 1:  # e.g. [nH]
        used_electrons += 1

    valence = _aromatic_valences[atom] - charge
    return (valence - used_electrons) % 2 != 0
class MolecularGraph:
    """A molecular graph layered on top of a ``smiles_symbols`` list.

    Every node and every edge is identified by an index into
    ``smiles_symbols``; whether an index denotes an atom or a bond depends
    on how it is used.

    :ivar smiles_symbols: the list built from the iterator returned by
        ``encoder._parse_smiles``. All other state refers to it by index.
    :ivar graph: maps an atom index to the list of Bond objects attached
        to that atom.
    :ivar aro_indices: the set of atom indices that are aromatic.
    """

    def __init__(self, smiles_symbols: List[List[Union[str, int]]]):
        self.smiles_symbols = smiles_symbols
        self.graph = {}
        self.aro_indices = set()

    def get_atom_symbol(self, idx: int) -> str:
        """Returns the atom symbol stored at an index.

        :param idx: an index in ``smiles_symbols``.
        :return: the second field of the entry at ``idx``, which holds
            the SMILES atom symbol.
        """

        entry = self.smiles_symbols[idx]
        return entry[1]

    def get_bond_symbol(self, idx: int) -> str:
        """Returns the bond symbol stored at an index.

        :param idx: an index in ``smiles_symbols``.
        :return: the first field of the entry at ``idx``, which holds
            the SMILES bond symbol.
        """

        entry = self.smiles_symbols[idx]
        return entry[0]

    def get_nodes_by_num_edges(self) -> List[int]:
        """Lists all nodes, with the sparsely connected ones first.

        ``dfs_assign_bonds`` has fewer bond configurations to explore when
        it starts from nodes with few edges. Rather than fully sorting, the
        nodes with at most one edge are simply placed ahead of all others;
        within each group the graph's own ordering is kept.

        :return: the node indices of this graph, semi-sorted by edge count.
        """

        adjacency = self.graph
        leaves = [n for n, bonds in adjacency.items() if len(bonds) <= 1]
        hubs = [n for n, bonds in adjacency.items() if len(bonds) > 1]
        return leaves + hubs

    def set_atom_symbol(self, atom_symbol: str, idx: int) -> None:
        """Overwrites the atom symbol stored at an index.

        :param atom_symbol: the new atom symbol.
        :param idx: an index in ``smiles_symbols``.
        :return: None.
        """

        entry = self.smiles_symbols[idx]
        entry[1] = atom_symbol

    def set_bond_symbol(self, bond_symbol: str, idx: int) -> None:
        """Overwrites the bond symbol stored at an index.

        :param bond_symbol: the new bond symbol.
        :param idx: an index in ``smiles_symbols``.
        :return: None.
        """

        entry = self.smiles_symbols[idx]
        entry[0] = bond_symbol

    def add_bond(self, idx_a: int, idx_b: int, bond_idx: int) -> None:
        """Inserts an edge between two atoms and records aromaticity.

        An endpoint is marked aromatic when it already is, when its symbol
        is aromatic, or when the bond itself is the aromatic bond ``':'``.
        An aromatic ``':'`` bond is rewritten to the empty bond symbol, both
        in ``smiles_symbols`` and on the created edge.

        :param idx_a: the index of one atom of the bond.
        :param idx_b: the index of the other atom of the bond.
        :param bond_idx: the index of the bond itself.
        :return: None.
        """

        endpoints = (idx_a, idx_b)
        symbols = [self.get_atom_symbol(i) for i in endpoints]

        # both flags are evaluated before aro_indices is touched
        flags = [(i in self.aro_indices) or _is_aromatic(sym)
                 for i, sym in zip(endpoints, symbols)]
        bond_symbol = self.get_bond_symbol(bond_idx)

        self.aro_indices.update(
            i for i, aromatic in zip(endpoints, flags) if aromatic
        )

        if bond_symbol == ':':
            self.aro_indices.update(endpoints)
            # Note: ':' bonds are edited here to ''
            bond_symbol = ''
            self.set_bond_symbol(bond_symbol, bond_idx)

        # the same Bond object is shared by both adjacency lists
        edge = Bond(idx_a, idx_b, bond_symbol, bond_idx)
        for i in endpoints:
            self.graph.setdefault(i, []).append(edge)

    def prune_to_pi_subgraph(self) -> None:
        """Shrinks this graph, in place, down to its pi subgraph.

        Afterwards only aromatic atoms accepted by ``_in_pi_subgraph``
        remain, connected solely by empty-symbol bonds whose two endpoints
        both survived.

        :return: None.
        """

        # drop every node that is not aromatic
        for node in self.graph.keys() - self.aro_indices:
            del self.graph[node]

        # drop aromatic nodes that cannot take part in a double bond
        for node in self.aro_indices:
            symbol = self.get_atom_symbol(node)
            attached = tuple(e.bond_symbol for e in self.graph[node])
            if not _in_pi_subgraph(symbol, attached):
                del self.graph[node]

        # drop edges that leave the subgraph or already have a bond order
        for node in self.graph:
            self.graph[node] = [
                e for e in self.graph[node]
                if (e.idx_a in self.graph)
                and (e.idx_b in self.graph)
                and (e.bond_symbol == '')
            ]

    def dfs_assign_bonds(self, idx: int,
                         visited: Set[int],
                         matched_nodes: Set[int],
                         matched_edges) -> bool:
        """Pairs up the nodes of the pi subgraph with double bonds.

        Intended to run after ``prune_to_pi_subgraph``. Working depth-first
        with backtracking, it tries to choose edges so that every reachable
        node is an endpoint of a chosen edge; chosen edges get the bond
        symbol ``'='``. The three set arguments are mutated in place.

        :param idx: the index of the current atom (or node).
        :param visited: the indices of nodes that have been visited.
        :param matched_nodes: the indices of nodes that already have a
            double bond assigned.
        :param matched_edges: the set of bonds that have been matched.
        :return: True, if a valid bond assignment was found; False otherwise.
        """

        if idx in visited:
            return True

        incident = self.graph[idx]

        if idx not in matched_nodes:
            # edges whose far end is still free may become a double bond;
            # the list is fixed before matched_nodes starts changing
            options = [bond for bond in incident
                       if bond.other_end(idx) not in matched_nodes]
            if not options:
                return False  # idx is unmatched, but all adj nodes are

            edges_before = matched_edges.copy()
            for bond in options:
                matched_nodes.update((bond.idx_a, bond.idx_b))
                matched_edges.add(bond)

                if self.dfs_assign_bonds(idx, visited,
                                         matched_nodes, matched_edges):
                    bond.bond_symbol = '='
                    return True

                # this attempt failed: undo everything matched since the
                # snapshot before trying the next option
                for undone in matched_edges - edges_before:
                    undone.bond_symbol = ''
                    matched_nodes.discard(undone.idx_a)
                    matched_nodes.discard(undone.idx_b)
                matched_edges.intersection_update(edges_before)
            return False

        # idx already has its double bond, so every neighbour must be
        # matched in turn; on failure the visited set is rolled back
        snapshot = visited.copy()
        visited.add(idx)
        for bond in incident:
            neighbour = bond.other_end(idx)
            if not self.dfs_assign_bonds(neighbour, visited,
                                         matched_nodes, matched_edges):
                visited.intersection_update(snapshot)
                return False
        return True

    def write_to_smiles_symbols(self):
        """Copies the state of ``self.graph`` back into ``smiles_symbols``.

        Called once kekulization is done: every aromatic atom symbol is
        passed through ``_capitalize`` and every edge's bond symbol is
        written to its slot in ``smiles_symbols``.

        :return: None.
        """

        # capitalize aromatic atoms
        for node in self.aro_indices:
            upper = _capitalize(self.get_atom_symbol(node))
            self.set_atom_symbol(upper, node)

        # write bonds
        every_edge = (e for bonds in self.graph.values() for e in bonds)
        for edge in every_edge:
            where, symbol = edge.bond_idx, edge.bond_symbol
            self.set_bond_symbol(symbol, where)

            if where <= 0:
                continue
            # a branch entry records the symbol after it as its bond, so
            # it has to be kept in sync
            if self.smiles_symbols[where - 1][2] == BRANCH_TYPE:
                self.set_bond_symbol(symbol, where - 1)
class Bond:
    """An edge of a MolecularGraph.

    All indices refer to positions in the ``smiles_symbols`` list of the
    owning MolecularGraph. Two bonds compare (and hash) equal when their
    ordered endpoint pair is the same; the bond symbol and bond index do
    not take part in the comparison.

    :ivar idx_a: the index of one atom or node of this bond.
    :ivar idx_b: the index of the other atom or node of this bond.
    :ivar bond_symbol: the SMILES symbol representing this bond (e.g. '#').
    :ivar bond_idx: the index of this bond or edge.
    """

    def __init__(self, idx_a, idx_b, bond_symbol, bond_idx):
        self.idx_a, self.idx_b = idx_a, idx_b
        self.bond_symbol, self.bond_idx = bond_symbol, bond_idx

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return NotImplemented
        mine = (self.idx_a, self.idx_b)
        theirs = (other.idx_a, other.idx_b)
        return mine == theirs

    def __hash__(self):
        endpoints = (self.idx_a, self.idx_b)
        return hash(endpoints)

    def other_end(self, idx):
        """Returns the endpoint of this bond opposite to a given endpoint.

        :param idx: an index of one atom or node of this bond.
        :return: the index at the other end of this bond, or None if
            ``idx`` is not an endpoint of this bond.
        """

        if idx == self.idx_a:
            return self.idx_b
        if idx == self.idx_b:
            return self.idx_a
        return None
-288
| from typing import Dict, Iterable, List, Set, Tuple, Union | ||
def len_selfies(selfies: str) -> int:
    """Returns how many symbols a SELFIES is made of.

    This is the count of SELFIES symbols, which is generally different
    from the character length of the string (i.e. ``len(selfies)``). It is
    obtained by counting the opening brackets and the ``'.'`` dot-bonds.

    :param selfies: a SELFIES.
    :return: the symbol length of ``selfies``.

    :Example:

    >>> import selfies
    >>> selfies.len_selfies('[C][O][C]')
    3
    >>> selfies.len_selfies('[C][=C][F].[C]')
    5
    """

    # every symbol either opens with '[' or is a lone '.'
    return sum(selfies.count(marker) for marker in ("[", "."))
def split_selfies(selfies: str) -> Iterable[str]:
    """Splits a SELFIES into its symbols.

    Returns an iterable that yields the symbols of a SELFIES one-by-one
    in the order they appear in the string. SELFIES symbols are always
    either indicated by an open and closed square bracket, or are the ``'.'``
    dot-bond symbol.

    :param selfies: the SELFIES to be read.
    :return: an iterable of the symbols of ``selfies`` in the same order
        they appear in the string.
    :raises ValueError: (while iterating) if an opening bracket ``'['`` is
        never closed by a ``']'``.

    :Example:

    >>> import selfies
    >>> list(selfies.split_selfies('[C][O][C]'))
    ['[C]', '[O]', '[C]']
    >>> list(selfies.split_selfies('[C][=C][F].[C]'))
    ['[C]', '[=C]', '[F]', '.', '[C]']
    """

    left_idx = selfies.find("[")

    while 0 <= left_idx < len(selfies):
        right_idx = selfies.find("]", left_idx + 1)
        if right_idx == -1:
            # Without this guard the slice below would be empty and
            # left_idx would reset to 0, yielding '' forever.
            raise ValueError(
                "malformed SELFIES string, hanging '[' bracket"
            )

        yield selfies[left_idx: right_idx + 1]

        left_idx = right_idx + 1
        # a dot-bond directly after a symbol is a symbol of its own
        if selfies[left_idx: left_idx + 1] == ".":
            yield "."
            left_idx += 1
def get_alphabet_from_selfies(selfies_iter: Iterable[str]) -> Set[str]:
    """Collects the set of symbols used by a group of SELFIES.

    The result is the smallest set of SELFIES symbols from which every
    SELFIES in the input can be assembled. The dot-bond symbol ``'.'`` is
    never part of the returned set, even when it occurs in the input.

    :param selfies_iter: an iterable of SELFIES.
    :return: the SELFIES alphabet built from the SELFIES in ``selfies_iter``.

    :Example:

    >>> import selfies
    >>> selfies_list = ['[C][F][O]', '[C].[O]', '[F][F]']
    >>> alphabet = selfies.get_alphabet_from_selfies(selfies_list)
    >>> sorted(list(alphabet))
    ['[C]', '[F]', '[O]']
    """

    symbols = {
        token
        for entry in selfies_iter
        for token in split_selfies(entry)
    }
    symbols.discard(".")  # the dot-bond is deliberately left out
    return symbols
def selfies_to_encoding(
        selfies: str,
        vocab_stoi: Dict[str, int],
        pad_to_len: int = -1,
        enc_type: str = 'both'
) -> Union[List[int], List[List[int]], Tuple[List[int], List[List[int]]]]:
    """Encodes a SELFIES as labels (integers) and/or one-hot vectors.

    The label encoding is a list of size ``(N,)`` and the one-hot encoding
    is a list of size ``(N, len(vocab_stoi))``, where ``N`` is the symbol
    length of the (potentially padded) SELFIES. Padding is done with the
    special symbol ``[nop]``.

    :param selfies: the SELFIES to be encoded.
    :param vocab_stoi: a dictionary that maps SELFIES symbols (the keys)
        to a non-negative index. The indices of the dictionary
        must be contiguous, starting from 0.
    :param pad_to_len: the length the SELFIES is to be padded to.
        If ``pad_to_len`` is less than or equal to the symbol
        length of the SELFIES, then no padding is added. Defaults to ``-1``.
    :param enc_type: the type of encoding of the output:
        ``label`` or ``one_hot`` or ``both``.
        If the value is ``both``, then a tuple of the label and one-hot
        encoding are returned (in that order). Defaults to ``both``.
    :return: the label encoded and/or one-hot encoded SELFIES.
    :raises ValueError: if ``enc_type`` is not one of the accepted values.

    :Example:

    >>> import selfies as sf
    >>> sf.selfies_to_encoding('[C][F]', {'[C]': 0, '[F]': 1})
    ([0, 1], [[1, 0], [0, 1]])
    """

    if enc_type not in ('label', 'one_hot', 'both'):
        raise ValueError("enc_type must be in ('label', 'one_hot', 'both')")

    # append as many [nop] symbols as are missing to reach pad_to_len
    missing = pad_to_len - len_selfies(selfies)
    if missing > 0:
        selfies += "[nop]" * missing

    # label (integer) encoding
    labels = [vocab_stoi[symbol] for symbol in split_selfies(selfies)]
    if enc_type == 'label':
        return labels

    # one-hot encoding: one row per symbol, one column per vocabulary entry
    width = len(vocab_stoi)
    one_hot = []
    for label in labels:
        row = [0] * width
        row[label] = 1
        one_hot.append(row)

    return one_hot if enc_type == 'one_hot' else (labels, one_hot)
def encoding_to_selfies(
        encoded: Union[List[int], List[List[int]]],
        vocab_itos: Dict[int, str],
        enc_type: str,
) -> str:
    """Decodes a label (integer) or one-hot encoded list into a SELFIES.

    A label encoded input is expected to be a list of size ``(N,)``;
    a one-hot encoded input is expected to be a 2D list of size
    ``(N, len(vocab_itos))``.

    :param encoded: a label or one-hot encoded list.
    :param vocab_itos: a dictionary that maps non-negative indices (the keys)
        to SELFIES symbols. The indices of the dictionary
        must be contiguous, starting from 0.
    :param enc_type: the type of encoding of the input:
        ``label`` or ``one_hot``.
    :return: the SELFIES string represented by the encoded input.
    :raises ValueError: if ``enc_type`` is neither ``label`` nor
        ``one_hot``, or if a one-hot row does not contain a ``1``.

    :Example:

    >>> import selfies as sf
    >>> one_hot = [[0, 1, 0], [0, 0, 1], [1, 0, 0]]
    >>> vocab_itos = {0: '[nop]', 1: '[C]', 2: '[F]'}
    >>> sf.encoding_to_selfies(one_hot, vocab_itos, enc_type='one_hot')
    '[C][F][nop]'
    """

    if enc_type == 'label':
        labels = encoded
    elif enc_type == 'one_hot':
        # the label of a row is the position of its first 1
        labels = [row.index(1) for row in encoded]
    else:
        raise ValueError("enc_type must be in ('label', 'one_hot')")

    # look every label up in the vocabulary and glue the symbols together
    return "".join([vocab_itos[label] for label in labels])
def batch_selfies_to_flat_hot(
        selfies_batch: List[str],
        vocab_stoi: Dict[str, int],
        pad_to_len: int = -1,
) -> List[List[int]]:
    """Encodes a list of SELFIES as flattened one-hot vectors.

    The result is a list of size ``(batch_size, N * len(vocab_stoi))``,
    where ``N`` is the symbol length of the (potentially padded) SELFIES.
    Padding is done with the special symbol ``[nop]``.

    :param selfies_batch: a list of SELFIES to be converted.
    :param vocab_stoi: a dictionary that maps SELFIES symbols (the keys)
        to a non-negative index. The indices of the dictionary
        must be contiguous, starting from 0.
    :param pad_to_len: the length that each SELFIES is to be padded to.
        If ``pad_to_len`` is less than or equal to the symbol
        length of the SELFIES, then no padding is added. Defaults to ``-1``.
    :return: the flattened one-hot encoded representations of the SELFIES
        from the batch. This is a 2D list of size
        ``(batch_size, N * len(vocab_stoi))``.

    :Example:

    >>> import selfies as sf
    >>> batch = ["[C]", "[C][C]"]
    >>> vocab_stoi = {'[nop]': 0, '[C]': 1}
    >>> sf.batch_selfies_to_flat_hot(batch, vocab_stoi, 2)
    [[0, 1, 1, 0], [0, 1, 0, 1]]
    """

    flat_batch = []
    for entry in selfies_batch:
        rows = selfies_to_encoding(entry, vocab_stoi, pad_to_len,
                                   enc_type='one_hot')
        # concatenate the one-hot rows into a single vector
        flat = []
        for row in rows:
            flat.extend(row)
        flat_batch.append(flat)
    return flat_batch
def batch_flat_hot_to_selfies(
        one_hot_batch: List[List[int]],
        vocab_itos: Dict[int, str],
) -> List[str]:
    """Decodes a batch of flattened one-hot vectors into SELFIES.

    ``one_hot_batch`` is expected to be a list of size ``(batch_size, S)``,
    where ``S`` is divisible by the length of the vocabulary.

    :param one_hot_batch: a list of flattened one-hot encoded representations.
    :param vocab_itos: a dictionary that maps non-negative indices (the keys)
        to SELFIES symbols. We expect the indices of the dictionary
        to be contiguous and starting from 0.
    :return: a list of SELFIES strings.
    :raises ValueError: if the size of a vector in the batch is not
        divisible by the length of the vocabulary.

    :Example:

    >>> import selfies as sf
    >>> batch = [[0, 1, 1, 0], [0, 1, 0, 1]]
    >>> vocab_itos = {0: '[nop]', 1: '[C]'}
    >>> sf.batch_flat_hot_to_selfies(batch, vocab_itos)
    ['[C][nop]', '[C][C]']
    """

    vocab_size = len(vocab_itos)
    decoded = []

    for flat in one_hot_batch:
        if len(flat) % vocab_size != 0:
            raise ValueError("size of vector in one_hot_batch not divisible "
                             "by the length of the vocabulary.")

        # cut the flat vector back into rows of vocab_size entries, one row
        # per position in the SELFIES
        rows = [flat[start: start + vocab_size]
                for start in range(0, len(flat), vocab_size)]
        decoded.append(
            encoding_to_selfies(rows, vocab_itos, enc_type='one_hot')
        )

    return decoded
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
109021
2.69%23
64.29%1709
15.86%