selfies - npm Package Compare versions

+199

selfies/bond_constraints.py

		import functools
		from itertools import product
		from typing import Dict, Set, Union

		from selfies.constants import ELEMENTS, INDEX_ALPHABET

		# Default bonding capacities. Keys have the form "E", "E+C" or "E-C"
		# (element symbol plus optional charge); the special "?" entry is the
		# capacity of every atom that is not listed explicitly.
		_DEFAULT_CONSTRAINTS = {
		    "H": 1, "F": 1, "Cl": 1, "Br": 1, "I": 1,
		    "B": 3, "B+1": 2, "B-1": 4,
		    "O": 2, "O+1": 3, "O-1": 1,
		    "N": 3, "N+1": 4, "N-1": 2,
		    "C": 4, "C+1": 5, "C-1": 3,
		    "P": 5, "P+1": 6, "P-1": 4,
		    "S": 6, "S+1": 7, "S-1": 5,
		    "?": 8,
		}

		# Named presets. Each preset is an independent dictionary that starts from
		# the defaults; "octet_rule" and "hypervalent" override a few capacities.
		_PRESET_CONSTRAINTS = {
		    "default": dict(_DEFAULT_CONSTRAINTS),
		    "octet_rule": {
		        **_DEFAULT_CONSTRAINTS,
		        "S": 2, "S+1": 3, "S-1": 1, "P": 3, "P+1": 4, "P-1": 2,
		    },
		    "hypervalent": {
		        **_DEFAULT_CONSTRAINTS,
		        "Cl": 7, "Br": 7, "I": 7, "N": 5,
		    },
		}

		# The constraints currently in effect; initially the "default" preset.
		_current_constraints = _PRESET_CONSTRAINTS["default"]


		def get_preset_constraints(name: str) -> Dict[str, int]:
		    """Looks up one of the preset semantic constraints by name.

		    Three presets are offered for convenience: ``default``, ``octet_rule``
		    (constraints that enforce the
		    `octet rule <https://en.wikipedia.org/wiki/Octet_rule>`_) and
		    ``hypervalent`` (constraints that accommodate `hypervalent molecules
		    <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_).

		    The presets differ only in the following bonding capacities:

		    .. table::
		        :align: center
		        :widths: auto

		        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
		        |                 | Cl, Br, I | N | P | P+1 | P-1 | S | S+1 | S-1 |
		        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
		        | ``default``     |     1     | 3 | 5 |  6  |  4  | 6 |  7  |  5  |
		        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
		        | ``octet_rule``  |     1     | 3 | 3 |  4  |  2  | 2 |  3  |  1  |
		        +-----------------+-----------+---+---+-----+-----+---+-----+-----+
		        | ``hypervalent`` |     7     | 5 | 5 |  6  |  4  | 6 |  7  |  5  |
		        +-----------------+-----------+---+---+-----+-----+---+-----+-----+

		    :param name: the preset name: ``default`` or ``octet_rule`` or
		        ``hypervalent``.
		    :return: a new dictionary holding the requested preset, which maps
		        atoms (the keys) to their bonding capacities (the values).
		    :raises ValueError: if ``name`` is not the name of a preset.
		    """

		    preset = _PRESET_CONSTRAINTS.get(name)
		    if preset is None:
		        raise ValueError("unrecognized preset name '{}'".format(name))

		    # return a copy so that callers cannot alter the stored preset
		    return dict(preset)


		def get_semantic_constraints() -> Dict[str, int]:
		    """Reports the semantic constraints that :mod:`selfies` currently uses.

		    :return: a copy of the current semantic constraints, represented as a
		        dictionary which maps atoms (the keys) to their bonding capacities
		        (the values).
		    """

		    # a copy is returned so that mutating the result leaves the
		    # module-level state untouched
		    return _current_constraints.copy()


		def set_semantic_constraints(
		        bond_constraints: Union[str, Dict[str, int]] = "default"
		) -> None:
		    """Replaces the semantic constraints that :mod:`selfies` operates on.

		    A string argument is interpreted as the name of a preset
		    (see :func:`selfies.get_preset_constraints`).

		    A dictionary argument is taken as the new constraints themselves.
		    It maps atoms (the keys) to non-negative bonding capacities
		    (the values); atoms are written as ``E`` or ``E+C`` or ``E-C``,
		    where ``E`` is an element symbol and ``C`` is a positive integer.
		    For example:

		    * ``bond_constraints["I-1"] = 0``
		    * ``bond_constraints["C"] = 4``

		    The dictionary must also contain the special ``?`` key, which gives
		    the bonding capacity of all atoms that are not explicitly listed.

		    :param bond_constraints: the name of a preset, or a dictionary
		        representing the new semantic constraints.
		    :return: ``None``.
		    :raises ValueError: if the preset name is unknown, if the argument is
		        neither a string nor a dictionary, or if the dictionary lacks the
		        ``?`` key or contains an invalid key or value.
		    """

		    global _current_constraints

		    if isinstance(bond_constraints, str):
		        new_constraints = get_preset_constraints(bond_constraints)

		    elif not isinstance(bond_constraints, dict):
		        raise ValueError("bond_constraints must be a str or dict")

		    else:
		        if "?" not in bond_constraints:
		            raise ValueError("bond_constraints missing '?' as a key")

		        for atom, capacity in bond_constraints.items():

		            # the key must be "?", a bare element, or element + signed charge
		            sign_idx = max(atom.find("+"), atom.find("-"))
		            if atom == "?":
		                key_ok = True
		            elif sign_idx == -1:
		                key_ok = atom in ELEMENTS
		            else:
		                element, charge = atom[:sign_idx], atom[sign_idx + 1:]
		                key_ok = (element in ELEMENTS) and charge.isnumeric()

		            if not key_ok:
		                raise ValueError(
		                    "invalid key '{}' in bond_constraints".format(atom)
		                )

		            # the value must be a non-negative integer
		            if (not isinstance(capacity, int)) or (capacity < 0):
		                raise ValueError(
		                    "invalid value at bond_constraints['{}'] = {}"
		                    .format(atom, capacity)
		                )

		        new_constraints = dict(bond_constraints)

		    _current_constraints = new_constraints

		    # cached results were computed under the previous constraints
		    get_semantic_robust_alphabet.cache_clear()
		    get_bonding_capacity.cache_clear()


		@functools.lru_cache()
		def _build_semantic_robust_alphabet() -> frozenset:
		    """Computes the semantically robust alphabet for the current constraints.

		    The result is immutable because it is shared through the cache.
		    """

		    bond_orders = {"": 1, "=": 2, "#": 3}
		    symbols = set()

		    # atomic symbols: every listed atom with every bond order it can hold
		    for (atom, capacity), (bond_char, order) in product(
		            _current_constraints.items(), bond_orders.items()
		    ):
		        if (order > capacity) or (atom == "?"):
		            continue
		        symbols.add("[{}{}]".format(bond_char, atom))

		    # branch and ring symbols
		    for i in range(1, 4):
		        symbols.add("[Ring{}]".format(i))
		        symbols.add("[=Ring{}]".format(i))
		        symbols.add("[Branch{}]".format(i))
		        symbols.add("[=Branch{}]".format(i))
		        symbols.add("[#Branch{}]".format(i))

		    symbols.update(INDEX_ALPHABET)

		    return frozenset(symbols)


		def get_semantic_robust_alphabet() -> Set[str]:
		    """Returns a subset of all SELFIES symbols that are constrained
		    by :mod:`selfies` under the current semantic constraints.

		    A new set is returned on every call, so callers may freely modify
		    the result (e.g. add a padding symbol) without affecting later calls.

		    :return: a subset of all SELFIES symbols that are semantically
		        constrained.
		    """

		    return set(_build_semantic_robust_alphabet())


		# Keep the cache-control attributes that the lru_cache-wrapped function used
		# to expose; set_semantic_constraints() relies on ``cache_clear``.
		get_semantic_robust_alphabet.cache_clear = \
		    _build_semantic_robust_alphabet.cache_clear
		get_semantic_robust_alphabet.cache_info = \
		    _build_semantic_robust_alphabet.cache_info


		@functools.lru_cache()
		def get_bonding_capacity(element: str, charge: int) -> int:
		    """Looks up the bonding capacity of an atom under the current
		    semantic constraints.

		    :param element: the element of the input atom.
		    :param charge: the charge of the input atom.
		    :return: the bonding capacity of the input atom; atoms that are not
		        listed in the constraints get the capacity stored under ``?``.
		    """

		    # neutral atoms are keyed by element alone, charged atoms by the
		    # element followed by the signed charge (e.g. "N+1", "O-1")
		    if charge == 0:
		        lookup_key = element
		    else:
		        lookup_key = element + "{:+}".format(charge)

		    try:
		        return _current_constraints[lookup_key]
		    except KeyError:
		        return _current_constraints["?"]

+47

selfies/compatibility.py

		from selfies.utils.smiles_utils import atom_to_smiles, smiles_to_atom


		def modernize_symbol(symbol):
		    """Translates a SELFIES symbol from <v2 into its latest equivalent.

		    :param symbol: an old SELFIES symbol.
		    :return: the latest equivalent of the input symbol, or the input symbol
		        itself, if no such equivalent exists.
		    """

		    # renamed branch / ring symbols are resolved by direct lookup
		    renamed = _SYMBOL_UPDATE_TABLE.get(symbol)
		    if renamed is not None:
		        return renamed

		    if not symbol.endswith("expl]"):
		        return symbol

		    # symbols of the form [XXXexpl]: split off an optional leading bond
		    # character and drop the "expl" suffix
		    has_bond_char = symbol[1] in "=#/\\"
		    if has_bond_char:
		        bond_char, atom_symbol = symbol[1], symbol[2:-5]
		    else:
		        bond_char, atom_symbol = "", symbol[1:-5]

		    atom = smiles_to_atom("[{}]".format(atom_symbol))
		    if (atom is not None) and (not atom.is_aromatic):
		        # re-serialize the atom to obtain its standardized spelling
		        atom_symbol = atom_to_smiles(atom, brackets=False)

		    return "[{}{}]".format(bond_char, atom_symbol)


		def _build_update_table():
		    """Builds the table that maps old (<v2) branch and ring symbols to
		    their current spellings, for the indices 1 through 3.

		    :return: a dictionary from old symbols to new symbols.
		    """

		    # (old template, new template); "{}" is replaced by the index
		    templates = (
		        ("[Branch{}_1]", "[Branch{}]"),
		        ("[Branch{}_2]", "[=Branch{}]"),
		        ("[Branch{}_3]", "[#Branch{}]"),
		        ("[Expl=Ring{}]", "[=Ring{}]"),
		        ("[Expl#Ring{}]", "[#Ring{}]"),
		        ("[Expl/Ring{}]", "[//Ring{}]"),
		        ("[Expl\\Ring{}]", "[\\\\Ring{}]"),
		    )

		    return {
		        old.format(index): new.format(index)
		        for index in range(1, 4)
		        for old, new in templates
		    }


		# built once at import time and consulted by modernize_symbol()
		_SYMBOL_UPDATE_TABLE = _build_update_table()

+38

selfies/constants.py

		# Element symbols recognised by selfies (a set of strings).
		ELEMENTS = set(
		    "H He Li Be B C N O F Ne Na Mg "
		    "Al Si P S Cl Ar K Ca Sc Ti V Cr "
		    "Mn Fe Co Ni Cu Zn Ga Ge As Se Br "
		    "Kr Rb Sr Y Zr Nb Mo Tc Ru Rh Pd "
		    "Ag Cd In Sn Sb Te I Xe Cs Ba Hf "
		    "Ta W Re Os Ir Pt Au Hg Tl Pb Bi "
		    "Po At Rn Fr Ra Rf Db Sg Bh Hs Mt "
		    "Ds Rg Cn Fl Lv La Ce Pr Nd Pm Sm "
		    "Eu Gd Tb Dy Ho Er Tm Yb Lu Ac Th "
		    "Pa U Np Pu Am Cm Bk Cf Es Fm Md "
		    "No Lr".split()
		)

		# Elements of the SMILES organic subset.
		ORGANIC_SUBSET = {"B", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"}

		# Valences considered for each element that may appear as an aromatic atom.
		AROMATIC_VALENCES = {
		    "B": (3,), "Al": (3,),
		    "C": (4,), "Si": (4,),
		    "N": (3, 5), "P": (3, 5), "As": (3, 5),
		    "O": (2, 4), "S": (2, 4), "Se": (2, 4), "Te": (2, 4),
		}

		# Lower-case spellings of the elements above.
		AROMATIC_SUBSET = {element.lower() for element in AROMATIC_VALENCES}

		# =============================================================================
		# SELFIES-specific constants
		# =============================================================================


		# Ordered symbols; the position of a symbol is its index code.
		INDEX_ALPHABET = (
		    "[C]", "[Ring1]", "[Ring2]",
		    "[Branch1]", "[=Branch1]", "[#Branch1]",
		    "[Branch2]", "[=Branch2]", "[#Branch2]",
		    "[O]", "[N]", "[=N]", "[=C]", "[#C]", "[S]", "[P]",
		)

		# Reverse lookup: symbol -> position in INDEX_ALPHABET.
		INDEX_CODE = dict(zip(INDEX_ALPHABET, range(len(INDEX_ALPHABET))))

+36

selfies/exceptions.py

		class SMILESParserError(ValueError):
		    """Exception raised when a SMILES fails to be parsed.

		    :ivar smiles: the SMILES string that could not be parsed.
		    :ivar reason: a short description of the problem.
		    :ivar idx: the index into ``smiles`` at which the problem was detected
		        (``-1`` by default).
		    """

		    def __init__(self, smiles, reason="N/A", idx=-1):
		        # forward the details to ValueError so that ``args``, ``repr()``
		        # and pickling carry them as well
		        super().__init__(smiles, reason, idx)
		        self.smiles = smiles
		        self.idx = idx
		        self.reason = reason

		    def __str__(self):
		        smiles_label = "SMILES: "

		        # pad the caret by the width of the label, so that it is printed
		        # directly beneath the character smiles[idx]
		        pointer = " " * len(smiles_label) + " " * self.idx + "^"

		        err_msg = "\n" \
		                  "\t{label}{smiles}\n" \
		                  "\t{pointer}\n" \
		                  "\tIndex: {index}\n" \
		                  "\tReason: {reason}"

		        return err_msg.format(
		            label=smiles_label,
		            smiles=self.smiles,
		            pointer=pointer,
		            index=self.idx,
		            reason=self.reason
		        )


		class EncoderError(Exception):
		    """Error type raised by :func:`selfies.encoder`."""


		class DecoderError(Exception):
		    """Error type raised by :func:`selfies.decoder`."""

+258

selfies/mol_graph.py

		import functools
		import itertools
		from typing import List, Optional, Union

		from selfies.bond_constraints import get_bonding_capacity
		from selfies.constants import AROMATIC_VALENCES
		from selfies.utils.matching_utils import find_perfect_matching


		class Atom:
		    """An atom with associated specifications (e.g. charge, chirality).

		    ``index`` starts out as ``None``; it is assigned when the atom is
		    added to a molecular graph.
		    """

		    def __init__(
		            self,
		            element: str,
		            is_aromatic: bool,
		            isotope: Optional[int] = None,
		            chirality: Optional[str] = None,
		            h_count: Optional[int] = None,
		            charge: int = 0
		    ):
		        self.index = None
		        self.element = element
		        self.is_aromatic = is_aromatic
		        self.isotope = isotope
		        self.chirality = chirality
		        self.h_count = h_count
		        self.charge = charge

		    @property
		    def bonding_capacity(self):
		        """The number of bonds this atom may still form: the capacity of
		        its element and charge under the current semantic constraints,
		        minus its explicit hydrogen count (if one is specified).
		        """

		        # Deliberately not cached per instance: a per-instance cache would
		        # keep every Atom alive and would return stale values after the
		        # semantic constraints, the charge or the hydrogen count change.
		        # The underlying lookup is already cached.
		        bond_cap = get_bonding_capacity(self.element, self.charge)
		        if self.h_count is not None:
		            bond_cap -= self.h_count
		        return bond_cap

		    def invert_chirality(self) -> None:
		        """Swaps the chirality tag between ``@`` and ``@@``; any other
		        value (including ``None``) is left unchanged.
		        """

		        if self.chirality == "@":
		            self.chirality = "@@"
		        elif self.chirality == "@@":
		            self.chirality = "@"


		class DirectedBond:
		    """A bond that also records its direction (from ``src`` to ``dst``),
		    together with its order, stereo marker and whether it is a ring bond.
		    """

		    def __init__(
		            self,
		            src: int,
		            dst: int,
		            order: Union[int, float],
		            stereo: Optional[str],
		            ring_bond: bool
		    ):
		        # endpoints of the bond
		        self.src, self.dst = src, dst
		        # bond properties
		        self.order, self.stereo = order, stereo
		        self.ring_bond = ring_bond


		class MolecularGraph:
		    """A molecular graph.

		    Molecules can be viewed as weighted undirected graphs. SMILES and
		    SELFIES strings, however, are more naturally represented as weighted
		    directed graphs, in which the direction of an edge records the order
		    of atoms and bonds in the string.
		    """

		    def __init__(self):
		        self._roots = []  # indices of root atoms, where traversal begins
		        self._atoms = []  # the atoms of this graph, positioned by index
		        self._bond_dict = {}  # (src, dst) -> DirectedBond
		        self._adj_list = []  # per atom: list of outgoing bonds
		        self._bond_counts = []  # per atom: sum of the orders of its bonds
		        self._ring_bond_flags = []  # per atom: does it make a ring bond?
		        self._delocal_subgraph = {}  # delocalization subgraph

		    def __len__(self):
		        """Returns the number of atoms in this graph."""
		        return len(self._atoms)

		    def has_bond(self, a: int, b: int) -> bool:
		        """Checks whether a bond is stored from the smaller to the larger
		        of the two atom indices.
		        """
		        return (min(a, b), max(a, b)) in self._bond_dict

		    def has_out_ring_bond(self, src: int) -> bool:
		        """Returns whether atom ``src`` has been given a ring bond."""
		        return self._ring_bond_flags[src]

		    def get_roots(self) -> List[int]:
		        """Returns the list of root atom indices."""
		        return self._roots

		    def get_atom(self, idx: int) -> Atom:
		        """Returns the atom at index ``idx``."""
		        return self._atoms[idx]

		    def get_atoms(self) -> List[Atom]:
		        """Returns the list of all atoms."""
		        return self._atoms

		    def get_dirbond(self, src, dst) -> DirectedBond:
		        """Returns the directed bond stored for the pair ``(src, dst)``."""
		        return self._bond_dict[(src, dst)]

		    def get_out_dirbonds(self, src: int) -> List[DirectedBond]:
		        """Returns the outgoing bonds of atom ``src``."""
		        return self._adj_list[src]

		    def get_bond_count(self, idx: int) -> int:
		        """Returns the sum of the bond orders recorded for atom ``idx``."""
		        return self._bond_counts[idx]

		    def add_atom(self, atom: Atom, mark_root: bool = False) -> None:
		        """Appends an atom to the graph, assigning it the next free index.

		        :param atom: the atom to be added.
		        :param mark_root: whether the atom is recorded as a root.
		        """
		        new_index = len(self._atoms)
		        atom.index = new_index

		        if mark_root:
		            self._roots.append(new_index)

		        # grow every per-atom structure by one slot
		        self._atoms.append(atom)
		        self._adj_list.append([])
		        self._bond_counts.append(0)
		        self._ring_bond_flags.append(False)

		        if atom.is_aromatic:
		            self._delocal_subgraph[new_index] = []

		    def add_bond(
		            self, src: int, dst: int,
		            order: Union[int, float], stereo: str
		    ) -> None:
		        """Adds a (non-ring) bond from ``src`` to ``dst``, where
		        ``src < dst``.
		        """
		        assert src < dst

		        self._add_bond_at_loc(
		            DirectedBond(src, dst, order, stereo, False), -1
		        )
		        for endpoint in (src, dst):
		            self._bond_counts[endpoint] += order

		        if order == 1.5:
		            # bonds of order 1.5 are tracked in the delocalization subgraph
		            self._delocal_subgraph.setdefault(src, []).append(dst)
		            self._delocal_subgraph.setdefault(dst, []).append(src)

		    def add_placeholder_bond(self, src: int) -> int:
		        """Reserves a slot among the outgoing bonds of ``src``.

		        :return: the position of the reserved slot.
		        """
		        slots = self._adj_list[src]
		        slots.append(None)
		        return len(slots) - 1

		    def add_ring_bond(
		            self, a: int, b: int,
		            order: Union[int, float],
		            a_stereo: Optional[str], b_stereo: Optional[str],
		            a_pos: int = -1, b_pos: int = -1
		    ) -> None:
		        """Adds a ring bond between ``a`` and ``b``, stored as one directed
		        bond in each direction.

		        :param a_pos: position among the outgoing bonds of ``a`` at which
		            the bond is placed (``-1`` appends).
		        :param b_pos: the same, for ``b``.
		        """
		        self._add_bond_at_loc(
		            DirectedBond(a, b, order, a_stereo, True), a_pos
		        )
		        self._add_bond_at_loc(
		            DirectedBond(b, a, order, b_stereo, True), b_pos
		        )

		        self._bond_counts[a] += order
		        self._bond_counts[b] += order
		        self._ring_bond_flags[a] = True
		        self._ring_bond_flags[b] = True

		        if order == 1.5:
		            self._delocal_subgraph.setdefault(a, []).append(b)
		            self._delocal_subgraph.setdefault(b, []).append(a)

		    def update_bond_order(
		            self, a: int, b: int,
		            new_order: Union[int, float]
		    ) -> None:
		        """Changes the order of the existing bond between ``a`` and ``b``
		        and adjusts the bond counts of both atoms accordingly.
		        """
		        assert 1 <= new_order <= 3

		        # bonds are looked up from the smaller to the larger index
		        lo, hi = (a, b) if a <= b else (b, a)
		        forward = self._bond_dict[(lo, hi)]
		        old_order = forward.order
		        if new_order == old_order:
		            return

		        # ring bonds are stored in both directions; update both copies
		        affected = [forward]
		        if forward.ring_bond:
		            affected.append(self._bond_dict[(hi, lo)])
		        for bond in affected:
		            bond.order = new_order

		        self._bond_counts[lo] += (new_order - old_order)
		        self._bond_counts[hi] += (new_order - old_order)

		    def _add_bond_at_loc(self, bond, pos):
		        """Registers ``bond`` and places it among the outgoing bonds of its
		        source atom: appended, written into a placeholder slot, or
		        inserted at ``pos``.
		        """
		        self._bond_dict[(bond.src, bond.dst)] = bond

		        slots = self._adj_list[bond.src]
		        if pos in (-1, len(slots)):
		            slots.append(bond)
		        elif slots[pos] is None:
		            slots[pos] = bond  # fill a previously reserved placeholder
		        else:
		            slots.insert(pos, bond)

		    def is_kekulized(self) -> bool:
		        """Returns whether the delocalization subgraph is empty."""
		        return not self._delocal_subgraph

		    def kekulize(self) -> bool:
		        """Replaces the delocalized bonds by single and double bonds.

		        :return: ``True`` on success (or if there was nothing to do), and
		            ``False`` if no perfect matching of the pruned delocalization
		            subgraph exists, in which case the graph is left unchanged.
		        """
		        # Algorithm based on Depth-First article by Richard L. Apodaca
		        # Reference:
		        # https://depth-first.com/articles/2020/02/10/
		        # a-comprehensive-treatment-of-aromaticity-in-the-smiles-language/

		        if self.is_kekulized():
		            return True

		        ds = self._delocal_subgraph
		        kept_nodes = {
		            node for node in ds if not self._prune_from_ds(node)
		        }

		        # relabel kept DS nodes to be 0, 1, 2, ...
		        label_to_node = sorted(kept_nodes)
		        node_to_label = {
		            node: label for label, node in enumerate(label_to_node)
		        }

		        # pruned and relabelled DS, as an adjacency list indexed by label
		        pruned_ds = [
		            [node_to_label[adj] for adj in ds[node] if adj in kept_nodes]
		            for node in label_to_node
		        ]

		        matching = find_perfect_matching(pruned_ds)
		        if matching is None:
		            return False

		        # de-aromatize: every delocalized bond becomes a single bond
		        for node, neighbours in ds.items():
		            for adj in neighbours:
		                self.update_bond_order(node, adj, new_order=1)
		            self._atoms[node].is_aromatic = False
		            self._bond_counts[node] = int(self._bond_counts[node])

		        # ... and then every matched pair receives a double bond
		        for label, mate_label in enumerate(matching):
		            self.update_bond_order(
		                label_to_node[label], label_to_node[mate_label],
		                new_order=2
		            )

		        self._delocal_subgraph = {}  # clear DS
		        return True

		    def _prune_from_ds(self, node):
		        """Decides whether ``node`` is left out of the delocalization
		        subgraph before the matching is computed.
		        """
		        neighbours = self._delocal_subgraph[node]
		        if not neighbours:
		            return True  # aromatic atom with no aromatic bonds

		        atom = self._atoms[node]
		        valences = AROMATIC_VALENCES[atom.element]

		        # each bond in DS has order 1.5 - we treat them as single bonds
		        used_electrons = int(
		            self._bond_counts[node] - 0.5 * len(neighbours)
		        )

		        if atom.h_count is None:  # account for implicit Hs
		            assert atom.charge == 0
		            return used_electrons in valences

		        valence = valences[-1] - atom.charge
		        free_electrons = valence - (used_electrons + atom.h_count)
		        return (free_electrons < 0) or (free_electrons % 2 == 0)

selfies/utils/__init__.py

+196

selfies/utils/encoding_utils.py

		from typing import Dict, List, Tuple, Union

		from selfies.utils.selfies_utils import len_selfies, split_selfies


		def selfies_to_encoding(
		        selfies: str,
		        vocab_stoi: Dict[str, int],
		        pad_to_len: int = -1,
		        enc_type: str = 'both'
		) -> Union[List[int], List[List[int]], Tuple[List[int], List[List[int]]]]:
		    """Encodes a SELFIES string as labels (integers) and/or one-hot vectors.

		    The label encoding is a list of shape ``(L,)`` and the one-hot encoding
		    is a 2D list of shape ``(L, len(vocab_stoi))``, where ``L`` is the
		    symbol length of the SELFIES string. The string may optionally be
		    padded before it is encoded.

		    :param selfies: the SELFIES string to be encoded.
		    :param vocab_stoi: a dictionary that maps SELFIES symbols to indices,
		        which must be non-negative and contiguous, starting from 0.
		        If the SELFIES string is to be padded, then the special padding
		        symbol ``[nop]`` must also be a key in this dictionary.
		    :param pad_to_len: the length that the SELFIES string is padded to.
		        If this value is less than or equal to the symbol length of the
		        SELFIES string, then no padding is added. Defaults to ``-1``.
		    :param enc_type: the type of encoding of the output:
		        ``label`` or ``one_hot`` or ``both``.
		        If this value is ``both``, then a tuple of the label and one-hot
		        encodings is returned. Defaults to ``both``.
		    :return: the label encoded and/or one-hot encoded SELFIES string.
		    :raises ValueError: if ``enc_type`` is not one of the accepted values.

		    :Example:

		    >>> import selfies as sf
		    >>> sf.selfies_to_encoding("[C][F]", {"[C]": 0, "[F]": 1})
		    ([0, 1], [[1, 0], [0, 1]])
		    """

		    if enc_type not in ("label", "one_hot", "both"):
		        raise ValueError("enc_type must be in ('label', 'one_hot', 'both')")

		    # append as many [nop] symbols as are missing to reach pad_to_len
		    n_missing = pad_to_len - len_selfies(selfies)
		    if n_missing > 0:
		        selfies += "[nop]" * n_missing

		    labels = [vocab_stoi[symbol] for symbol in split_selfies(selfies)]
		    if enc_type == "label":
		        return labels

		    # one row per symbol, with a single 1 at the position of its label
		    vocab_size = len(vocab_stoi)
		    one_hot = []
		    for label in labels:
		        row = [0] * vocab_size
		        row[label] = 1
		        one_hot.append(row)

		    return one_hot if (enc_type == "one_hot") else (labels, one_hot)


		def encoding_to_selfies(
		        encoding: Union[List[int], List[List[int]]],
		        vocab_itos: Dict[int, str],
		        enc_type: str,
		) -> str:
		    """Decodes a label (integer) or one-hot encoding into a SELFIES string.

		    A label encoded input is expected to be a list of shape ``(L,)``;
		    a one-hot encoded input is expected to be a 2D list of
		    shape ``(L, len(vocab_itos))``.

		    :param encoding: a label or one-hot encoding.
		    :param vocab_itos: a dictionary that maps indices to SELFIES symbols.
		        The indices of this dictionary must be non-negative and contiguous,
		        starting from 0.
		    :param enc_type: the type of encoding of the input:
		        ``label`` or ``one_hot``.
		    :return: the SELFIES string represented by the input encoding.
		    :raises ValueError: if ``enc_type`` is not one of the accepted values.

		    :Example:

		    >>> import selfies as sf
		    >>> one_hot = [[0, 1, 0], [0, 0, 1], [1, 0, 0]]
		    >>> vocab_itos = {0: "[nop]", 1: "[C]", 2: "[F]"}
		    >>> sf.encoding_to_selfies(one_hot, vocab_itos, enc_type="one_hot")
		    '[C][F][nop]'
		    """

		    if enc_type not in ("label", "one_hot"):
		        raise ValueError("enc_type must be in ('label', 'one_hot')")

		    if enc_type == "one_hot":
		        # the label of a row is the position of its first 1
		        labels = [row.index(1) for row in encoding]
		    else:
		        labels = encoding

		    return "".join(vocab_itos[label] for label in labels)


		def batch_selfies_to_flat_hot(
		        selfies_batch: List[str],
		        vocab_stoi: Dict[str, int],
		        pad_to_len: int = -1,
		) -> List[List[int]]:
		    """Encodes a list of SELFIES strings as flattened one-hot encodings.

		    Every SELFIES string of the input list is one-hot encoded with
		    :func:`selfies.selfies_to_encoding` (receiving ``vocab_stoi`` and
		    ``pad_to_len`` as arguments), and the resulting rows are concatenated
		    into a single flat list.

		    :param selfies_batch: the list of SELFIES strings to be encoded.
		    :param vocab_stoi: a dictionary that maps SELFIES symbols to indices.
		    :param pad_to_len: the length that each SELFIES string in the input list
		        is padded to. Defaults to ``-1``.
		    :return: the flattened one-hot encodings of the input list.

		    :Example:

		    >>> import selfies as sf
		    >>> batch = ["[C]", "[C][C]"]
		    >>> vocab_stoi = {"[nop]": 0, "[C]": 1}
		    >>> sf.batch_selfies_to_flat_hot(batch, vocab_stoi, 2)
		    [[0, 1, 1, 0], [0, 1, 0, 1]]
		    """

		    flat_batch = []

		    for selfies in selfies_batch:
		        rows = selfies_to_encoding(
		            selfies, vocab_stoi, pad_to_len, enc_type="one_hot"
		        )

		        # concatenate the one-hot rows of this string
		        flat = []
		        for row in rows:
		            flat.extend(row)
		        flat_batch.append(flat)

		    return flat_batch


		def batch_flat_hot_to_selfies(
		        one_hot_batch: List[List[int]],
		        vocab_itos: Dict[int, str],
		) -> List[str]:
		    """Decodes a list of flattened one-hot encodings into a list
		    of SELFIES strings.

		    Every encoding of the input list is cut into rows of length
		    ``len(vocab_itos)`` and then decoded with
		    :func:`selfies.encoding_to_selfies`, which receives ``vocab_itos``
		    as an argument.

		    :param one_hot_batch: a list of flattened one-hot encodings. Each
		        encoding must be a list of length divisible by ``len(vocab_itos)``.
		    :param vocab_itos: a dictionary that maps indices to SELFIES symbols.
		    :return: the list of SELFIES strings represented by the input encodings.
		    :raises ValueError: if the length of an encoding is not divisible by
		        the size of the vocabulary.

		    :Example:

		    >>> import selfies as sf
		    >>> batch = [[0, 1, 1, 0], [0, 1, 0, 1]]
		    >>> vocab_itos = {0: "[nop]", 1: "[C]"}
		    >>> sf.batch_flat_hot_to_selfies(batch, vocab_itos)
		    ['[C][nop]', '[C][C]']
		    """

		    decoded = []

		    for flat in one_hot_batch:
		        vocab_size = len(vocab_itos)
		        if len(flat) % vocab_size != 0:
		            raise ValueError(
		                "size of vector in one_hot_batch not divisible by the "
		                "length of the vocabulary."
		            )
		        n_symbols = len(flat) // vocab_size

		        # un-flatten: one row of length vocab_size per symbol position
		        rows = [
		            flat[start: start + vocab_size]
		            for start in range(0, vocab_size * n_symbols, vocab_size)
		        ]

		        decoded.append(
		            encoding_to_selfies(rows, vocab_itos, enc_type="one_hot")
		        )

		    return decoded

+63

selfies/utils/linked_list.py

		from typing import Any


		class SinglyLinkedList:
		    """A simple singly linked list that supports O(1) append and O(1) extend.

		    Each node is a two-element list ``[item, next_node]``, where
		    ``next_node`` is ``None`` for the last node.
		    """

		    def __init__(self):
		        self._head = None  # first node, or None if the list is empty
		        self._tail = None  # last node, or None if the list is empty
		        self._count = 0  # number of items

		    def __len__(self):
		        return self._count

		    def __iter__(self):
		        return SinglyLinkedListIterator(self)

		    @property
		    def head(self):
		        """The first node of the list (``None`` if the list is empty)."""
		        return self._head

		    def append(self, item: Any) -> None:
		        """Adds ``item`` to the end of the list."""
		        new_node = [item, None]

		        if self._head is None:
		            self._head = new_node
		        else:
		            self._tail[1] = new_node

		        self._tail = new_node
		        self._count += 1

		    def extend(self, other) -> None:
		        """Attaches all nodes of ``other`` to the end of this list.

		        The nodes are linked in rather than copied, so afterwards they
		        are shared between both lists.
		        """
		        assert isinstance(other, SinglyLinkedList)

		        if other._head is None:
		            return  # nothing to attach

		        if self._head is None:
		            self._head = other._head
		        else:
		            self._tail[1] = other._head

		        self._tail = other._tail
		        self._count += len(other)


		class SinglyLinkedListIterator:
		    """Iterator over the items of a linked list, starting at its head."""

		    def __init__(self, linked_list):
		        self._curr = linked_list.head  # next node to visit, or None

		    def __iter__(self):
		        return self

		    def __next__(self):
		        node = self._curr
		        if node is None:
		            raise StopIteration

		        # a node is the pair [item, next_node]
		        self._curr = node[1]
		        return node[0]

+109

selfies/utils/matching_utils.py

		import heapq
		import itertools
		from collections import deque
		from typing import List, Optional


		def find_perfect_matching(graph: List[List[int]]) -> Optional[List[int]]:
		    """Searches for a perfect matching of an undirected graph
		    (without self-loops).

		    :param graph: an adjacency list representing the input graph.
		    :return: a list representing a perfect matching, where j is the i-th
		        element if nodes i and j are matched. Returns None, if the graph
		        cannot be perfectly matched.
		    """

		    # a greedy matching is computed first, for efficiency
		    matching = _greedy_matching(graph)

		    unmatched = {
		        node for node, mate in enumerate(matching) if mate is None
		    }

		    # repeatedly grow the matching along an augmenting path that starts
		    # at some unmatched node
		    while unmatched:
		        root = unmatched.pop()
		        path = _find_augmenting_path(graph, root, matching)
		        if path is None:
		            return None

		        _flip_augmenting_path(matching, path)

		        # both ends of the path are matched now
		        unmatched.discard(path[0])
		        unmatched.discard(path[-1])

		    return matching


		def _greedy_matching(graph):
		    """Greedily matches nodes, giving priority to nodes that have fewer
		    unmatched neighbors.

		    :param graph: an adjacency list representing the input graph.
		    :return: a list in which the i-th element is the mate of node i, or
		        None if node i was left unmatched.
		    """

		    matching = [None] * len(graph)

		    # free_degrees[i] = number of unmatched neighbors for node i
		    free_degrees = [len(neighbours) for neighbours in graph]

		    node_pqueue = [(degree, node) for node, degree in enumerate(free_degrees)]
		    heapq.heapify(node_pqueue)

		    while node_pqueue:
		        node = heapq.heappop(node_pqueue)[1]

		        already_matched = matching[node] is not None
		        if already_matched or (free_degrees[node] == 0):
		            continue  # node cannot be matched

		        # match node with first unmatched neighbor
		        mate = next(v for v in graph[node] if matching[v] is None)
		        matching[node], matching[mate] = mate, node

		        # neighbors of the new pair lose one unmatched neighbor each and
		        # are re-queued with their updated priority
		        for adj in itertools.chain(graph[node], graph[mate]):
		            free_degrees[adj] -= 1
		            if (matching[adj] is None) and (free_degrees[adj] > 0):
		                heapq.heappush(node_pqueue, (free_degrees[adj], adj))

		    return matching


		def _find_augmenting_path(graph, root, matching):
		    """Searches for an augmenting path that starts at the unmatched node
		    ``root`` and ends at another unmatched node.

		    :param graph: an adjacency list representing the input graph.
		    :param root: an unmatched node.
		    :param matching: the current matching.
		    :return: the nodes of the path, listed from the far end back to
		        ``root``, or None if no path was found.
		    """

		    assert matching[root] is None

		    # parent BFS tree - None indicates an unvisited node; a visited node
		    # stores [tree parent, node through which it was reached]
		    parents = [None] * len(graph)
		    parents[root] = [None, None]

		    # run modified BFS to find path from root to unmatched node
		    other_end = None
		    frontier = deque([root])

		    while frontier and (other_end is None):
		        node = frontier.popleft()

		        for adj in graph[node]:
		            adj_mate = matching[adj]

		            if adj_mate is None:  # unmatched node
		                if adj != root:  # augmenting path found!
		                    parents[adj] = [node, adj]
		                    other_end = adj
		                    break

		            elif parents[adj_mate] is None:  # adj_mate not visited
		                parents[adj_mate] = [node, adj]
		                frontier.append(adj_mate)

		    if other_end is None:
		        return None

		    # walk the BFS tree back from the far end to the root
		    path = []
		    node = other_end
		    while node != root:
		        parent, via = parents[node]
		        path.append(via)
		        path.append(parent)
		        node = parent
		    return path


		def _flip_augmenting_path(matching, path):
		    """Updates ``matching`` in place so that consecutive pairs of nodes
		    in ``path`` (1st with 2nd, 3rd with 4th, ...) are matched to each other.
		    """

		    for start in range(0, len(path), 2):
		        left, right = path[start], path[start + 1]
		        matching[left], matching[right] = right, left

+72

selfies/utils/selfies_utils.py

		from typing import Iterable, Iterator, Set


		def len_selfies(selfies: str) -> int:
		    """Counts the symbols of a given SELFIES string.

		    :param selfies: a SELFIES string.
		    :return: the symbol length of the SELFIES string.

		    :Example:

		    >>> import selfies as sf
		    >>> sf.len_selfies("[C][=C][F].[C]")
		    5
		    """

		    # every bracketed symbol opens with "[", and each dot is a symbol too
		    return sum(selfies.count(marker) for marker in ("[", "."))


		def split_selfies(selfies: str) -> Iterator[str]:
		    """Splits a SELFIES string into its individual symbols.

		    :param selfies: a SELFIES string.
		    :return: the symbols of the SELFIES string one-by-one with order
		        preserved.
		    :raises ValueError: if a symbol is opened but never closed.

		    :Example:

		    >>> import selfies as sf
		    >>> list(sf.split_selfies("[C][=C][F].[C]"))
		    ['[C]', '[=C]', '[F]', '.', '[C]']
		    """

		    total = len(selfies)
		    start = selfies.find("[")  # -1 (no symbol at all) ends the loop

		    while 0 <= start < total:
		        end = selfies.find("]", start + 1)
		        if end == -1:
		            raise ValueError("malformed SELFIES string, hanging '[' bracket")

		        yield selfies[start: end + 1]
		        start = end + 1

		        # a dot directly after a symbol is a symbol of its own
		        if selfies.startswith(".", start):
		            yield "."
		            start += 1


		def get_alphabet_from_selfies(selfies_iter: Iterable[str]) -> Set[str]:
		    """Collects an alphabet from an iterable of SELFIES strings.

		    The returned alphabet is the set of all symbols that occur in the
		    SELFIES strings of the input iterable, minus the dot ``.`` symbol.

		    :param selfies_iter: an iterable of SELFIES strings.
		    :return: an alphabet of SELFIES symbols, built from the input iterable.

		    :Example:

		    >>> import selfies as sf
		    >>> selfies_list = ["[C][F][O]", "[C].[O]", "[F][F]"]
		    >>> alphabet = sf.get_alphabet_from_selfies(selfies_list)
		    >>> sorted(list(alphabet))
		    ['[C]', '[F]', '[O]']
		    """

		    alphabet = set()
		    for selfies in selfies_iter:
		        alphabet.update(split_selfies(selfies))

		    # the dot separates fragments and is not part of the alphabet
		    alphabet.discard(".")
		    return alphabet

+425

selfies/utils/smiles_utils.py

		import enum
		import re
		from collections import deque
		from typing import Iterator, Optional, Tuple, Union

		from selfies.constants import AROMATIC_SUBSET, ELEMENTS, ORGANIC_SUBSET
		from selfies.exceptions import SMILESParserError
		from selfies.mol_graph import Atom, DirectedBond, MolecularGraph

		# Pattern for a complete bracketed SMILES atom (e.g. [13C@@H+2:7]). The six
		# groups capture, in order: isotope, element, chirality, hydrogen count,
		# charge and atom class; an absent optional part is captured as "".
		SMILES_BRACKETED_ATOM_PATTERN = re.compile(
		    r"^[\[]"  # opening square bracket [
		    r"(\d*)"  # isotope number (optional, e.g. 123, 26)
		    r"([A-Za-z][a-z]?)"  # element symbol
		    r"([@]{0,2})"  # chiral_tag (optional, only @ and @@ supported)
		    r"((?:[H]\d?)?)"  # H count (optional, e.g. H, H0, H3)
		    # the three charge spellings are alternatives, so they must be joined
		    # by an unescaped "|" (an escaped "\|" would match a literal pipe)
		    r"((?:[+]+|[-]+|[+-]\d+)?)"  # charge (optional, e.g. ---, +1, ++)
		    r"((?:[:]\d+)?)"  # atom class (optional, e.g. :12, :1)
		    r"[]]$"  # closing square bracket ]
		)

		# Bond character -> bond order; the aromatic bond ":" has order 1.5.
		SMILES_BOND_ORDERS = {"-": 1, "/": 1, "\\": 1, ":": 1.5, "=": 2, "#": 3}

		# Bond characters that carry stereo information.
		SMILES_STEREO_BONDS = {"/", "\\"}


		@enum.unique
		class SMILESTokenTypes(enum.Enum):
		    """The kinds of tokens that a SMILES string is split into."""

		    ATOM = 0  # an atom symbol, bracketed or not
		    BRANCH = 1  # an opening or closing branch bracket
		    RING = 2  # a ring number
		    DOT = 3  # the dot symbol


		class SMILESToken:
		    """A token in a SMILES string: a symbol (atom, branch bracket, ring
		    number, dot) together with its preceding bond, if it exists
		    (e.g. =C, %12, #N).

		    The token stores positions into the SMILES string rather than the
		    text itself: ``bond_idx`` is the index of the bond character (or
		    ``None``), and the symbol spans ``start_idx`` (inclusive) to
		    ``end_idx`` (exclusive).
		    """

		    def __init__(
		            self,
		            bond_idx: Optional[int],
		            start_idx: int, end_idx: int, token_type: SMILESTokenTypes
		    ):
		        self.bond_idx = bond_idx
		        self.start_idx, self.end_idx = start_idx, end_idx
		        self.token_type = token_type

		    def extract_bond_char(self, smiles):
		        """Returns the bond character of this token within ``smiles``,
		        or ``None`` if the token has no preceding bond.
		        """
		        if self.bond_idx is None:
		            return None
		        return smiles[self.bond_idx]

		    def extract_symbol(self, smiles):
		        """Returns the symbol of this token as a substring of ``smiles``."""
		        return smiles[self.start_idx:self.end_idx]


		def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]:
		    """Breaks a SMILES string up into its tokens.

		    :param smiles: the input SMILES string.
		    :return: the tokens of the input SMILES one-by-one with order preserved.
		    :raises SMILESParserError: if the string ends with a bond, a bond
		        directly precedes a branch bracket, a ``[`` is never closed, a
		        ``%`` is not followed by two digits, or an unrecognized symbol
		        is encountered.
		    """

		    length = len(smiles)
		    i = 0

		    while i < length:

		        if smiles[i] == ".":
		            yield SMILESToken(None, i, i + 1, SMILESTokenTypes.DOT)
		            i += 1
		            continue

		        # an optional bond character precedes the symbol
		        bond_idx = None
		        if smiles[i] in SMILES_BOND_ORDERS:
		            bond_idx = i
		            i += 1

		        if i == length:
		            raise SMILESParserError(smiles, "hanging bond", i - 1)

		        char = smiles[i]

		        if char.isalpha():  # organic subset elements
		            # two-letter elements (Br, Cl) vs. one-letter elements (C, N, ...)
		            width = 2 if (smiles[i: i + 2] in ("Br", "Cl")) else 1
		            token = SMILESToken(
		                bond_idx, i, i + width, SMILESTokenTypes.ATOM
		            )

		        elif char == "[":  # atoms encased in brackets (e.g. [NH])
		            close_idx = smiles.find("]", i + 1)
		            if close_idx == -1:
		                raise SMILESParserError(smiles, "hanging bracket [", i)
		            token = SMILESToken(
		                bond_idx, i, close_idx + 1, SMILESTokenTypes.ATOM
		            )

		        elif char in ("(", ")"):  # open and closed branch brackets
		            if bond_idx is not None:
		                raise SMILESParserError(smiles, "hanging_bond", bond_idx)
		            token = SMILESToken(None, i, i + 1, SMILESTokenTypes.BRANCH)

		        elif char.isdigit():  # one-digit ring number
		            token = SMILESToken(bond_idx, i, i + 1, SMILESTokenTypes.RING)

		        elif char == "%":  # two-digit ring number (e.g. %12)
		            rnum = smiles[i + 1: i + 3]
		            if not (rnum.isnumeric() and len(rnum) == 2):
		                err_msg = "invalid ring number '%{}'".format(rnum)
		                raise SMILESParserError(smiles, err_msg, i)
		            token = SMILESToken(bond_idx, i, i + 3, SMILESTokenTypes.RING)

		        else:
		            err_msg = "unrecognized symbol '{}'".format(char)
		            raise SMILESParserError(smiles, err_msg, i)

		        yield token
		        i = token.end_idx


		# =============================================================================
		# SMILES -> Atom, Graph, etc.
		# =============================================================================


		def smiles_to_atom(atom_symbol: str) -> Optional[Atom]:
		    """Reads an atom from its SMILES representation.

		    Unbracketed symbols are looked up in the organic and aromatic subsets.
		    Bracketed symbols are parsed with ``SMILES_BRACKETED_ATOM_PATTERN``;
		    the atom class, if present, is discarded.

		    :param atom_symbol: a SMILES atom symbol.
		    :return: the atom that the input symbol represents, or ``None`` if the
		        symbol (including the empty string) is not a recognized atom.
		    """

		    if not atom_symbol:
		        # indexing an empty string below would raise IndexError; treat it
		        # like any other unrecognized symbol instead
		        return None

		    if atom_symbol[0] == "[" and atom_symbol[-1] == "]":
		        pass  # continue below
		    elif atom_symbol in ORGANIC_SUBSET:  # e.g. C, N, O, ...
		        return Atom(atom_symbol, False)
		    elif atom_symbol in AROMATIC_SUBSET:  # e.g. c, n, o, ...
		        return Atom(atom_symbol.capitalize(), True)
		    else:
		        return None

		    # e.g. [C], [C@@H], [O-], ...
		    m = SMILES_BRACKETED_ATOM_PATTERN.match(atom_symbol)
		    if m is None:
		        return None
		    isotope, element, chirality, h_count, charge, _ = m.groups()

		    isotope = None if (isotope == "") else int(isotope)
		    # aromaticity is decided on the symbol as written, before the
		    # element is normalized to its capitalized form
		    is_aromatic = element.islower() and (element in AROMATIC_SUBSET)
		    element = element.capitalize()
		    if element not in ELEMENTS:
		        return None
		    chirality = None if (chirality == "") else chirality

		    s = h_count
		    if s == "":
		        h_count = 0  # no H field in the brackets means zero hydrogens
		    else:
		        s = s[1:]  # HXXX -> XXX
		        h_count = 1 if (s == "") else int(s)  # a bare H means one

		    s = charge
		    if s == "":
		        charge = 0
		    else:
		        if s[-1].isdigit():  # (+/-)XXX
		            charge = int(s[1:])
		        else:  # +++... or ---....
		            charge = len(s)
		        charge *= 1 if s[0] == "+" else -1  # apply the sign

		    return Atom(
		        element=element,
		        is_aromatic=is_aromatic,
		        isotope=isotope,
		        chirality=chirality,
		        h_count=h_count,
		        charge=charge
		    )


		def smiles_to_bond(
		        bond_char: Optional[str]
		) -> Tuple[Union[int, float], Optional[str]]:
		    """Translates a SMILES bond symbol into a bond order and stereo tag.

		    A symbol missing from ``SMILES_BOND_ORDERS`` (notably ``None``, i.e.
		    no explicit bond) is given order 1.

		    :param bond_char: a SMILES bond symbol.
		    :return: the order and stereochemical specification of the bond
		        that the input symbol represents. The stereo part is the symbol
		        itself for the stereo bonds, otherwise ``None``.
		    """

		    bond_order = SMILES_BOND_ORDERS.get(bond_char, 1)
		    if bond_char in SMILES_STEREO_BONDS:
		        return bond_order, bond_char
		    return bond_order, None


		def smiles_to_mol(smiles: str) -> MolecularGraph:
		    """Builds a molecular graph out of a SMILES string.

		    The string is tokenized up front; the tokens are then consumed one
		    fragment at a time until none are left.

		    :param smiles: the input SMILES string.
		    :return: a molecular graph that the input SMILES string represents.
		    :raises SMILESParserError: if the input SMILES is invalid.
		    """

		    if smiles == "":
		        raise SMILESParserError(smiles, "empty SMILES", 0)

		    graph = MolecularGraph()
		    pending = deque(tokenize_smiles(smiles))
		    while len(pending) > 0:
		        # each call removes the tokens of one fragment from the front
		        _derive_mol_from_tokens(graph, smiles, pending)
		    return graph


		def _derive_mol_from_tokens(mol, smiles, tokens):
		    """Consumes the tokens of one SMILES fragment from the front of
		    ``tokens`` and adds the atoms and bonds they describe to ``mol``.

		    Consumption stops after the first dot token (which is removed too)
		    or when ``tokens`` runs out.

		    :param mol: the molecular graph being built; modified in place.
		    :param smiles: the full SMILES string that the tokens index into.
		    :param tokens: a deque of tokens; modified in place.
		    :raises SMILESParserError: on an invalid atom symbol, a chain or
		        branch that begins with a non-atom, an unmatched bracket, an
		        unclosed ring number, or a fragment that contains no atoms.
		    """
		    tok = None
		    prev_stack = deque()  # keep track of previous atom on the current chain
		    branch_stack = deque()  # keep track of open branches
		    ring_log = dict()  # keep track of hanging ring numbers
		    chain_start = True

		    # Size of the graph before this fragment. Comparing against this,
		    # rather than against 0, detects an empty fragment even when earlier
		    # fragments already added atoms (e.g. the middle of "C..C"). Like the
		    # len(mol) == 0 test it replaces, this relies on len(mol) growing
		    # whenever an atom is added.
		    size_before = len(mol)

		    prev_stack.append(tok)  # None marks "no previous atom" (fragment root)
		    while tokens:
		        tok = tokens.popleft()
		        bond_char = tok.extract_bond_char(smiles)
		        symbol, symbol_type = tok.extract_symbol(smiles), tok.token_type
		        prev_atom = prev_stack[-1]

		        if symbol_type == SMILESTokenTypes.DOT:
		            break  # end of this fragment

		        elif symbol_type == SMILESTokenTypes.ATOM:
		            curr = smiles_to_atom(symbol)
		            if curr is None:
		                err_msg = "invalid atom symbol '{}'".format(symbol)
		                raise SMILESParserError(smiles, err_msg, tok.start_idx)

		            curr = _attach_atom(mol, bond_char, curr, prev_atom)
		            # the new atom becomes the previous atom of the current chain
		            prev_stack.pop()
		            prev_stack.append(curr)
		            chain_start = False

		        elif chain_start:
		            # only an atom may open a fragment or a branch
		            err_msg = "SMILES chain begins with non-atom"
		            raise SMILESParserError(smiles, err_msg, tok.start_idx)

		        elif symbol_type == SMILESTokenTypes.BRANCH:
		            if symbol == "(":
		                branch_stack.append(tok)
		                # the branch starts from the same atom as the main chain
		                prev_stack.append(prev_atom)
		                chain_start = True
		            else:
		                if not branch_stack:
		                    err_msg = "hanging ')' bracket"
		                    raise SMILESParserError(smiles, err_msg, tok.start_idx)
		                branch_stack.pop()
		                prev_stack.pop()

		        elif symbol_type == SMILESTokenTypes.RING:
		            if symbol not in ring_log:
		                # first occurrence: reserve the bond position on this atom
		                lpos = mol.add_placeholder_bond(src=prev_atom.index)
		                ring_log[symbol] = (tok, prev_atom, lpos)
		            else:
		                # second occurrence: close the ring
		                ltoken, latom, lpos = ring_log.pop(symbol)
		                _make_ring_bonds(
		                    mol=mol, smiles=smiles,
		                    ltoken=ltoken, latom=latom, lpos=lpos,
		                    rtoken=tok, ratom=prev_atom
		                )

		        else:
		            # should not happen
		            raise Exception("invalid symbol type")

		    if len(mol) == size_before:
		        err_idx = (len(smiles) if (tok is None) else tok.start_idx) - 1
		        raise SMILESParserError(smiles, "empty SMILES fragment", err_idx)

		    if branch_stack:
		        err_idx = branch_stack[-1].start_idx
		        raise SMILESParserError(smiles, "hanging '(' bracket", err_idx)

		    if ring_log:
		        # report the most recently opened ring number still unclosed
		        rnum, (tok, _, _) = list(ring_log.items())[-1]
		        err_msg = "hanging ring number '{}'".format(rnum)
		        raise SMILESParserError(smiles, err_msg, tok.start_idx)


		def _attach_atom(mol, bond_char, atom, prev_atom):
		is_root = (prev_atom is None)
		mol.add_atom(atom, mark_root=is_root)

		if not is_root:
		src, dst = prev_atom.index, atom.index
		order, stereo = smiles_to_bond(bond_char)
		if prev_atom.is_aromatic and atom.is_aromatic and (bond_char is None):
		order = 1.5 # handle implicit aromatic bonds, e.g. cc
		mol.add_bond(src=src, dst=dst, order=order, stereo=stereo)
		return atom


		def _make_ring_bonds(mol, smiles, ltoken, latom, lpos, rtoken, ratom):
		    """Closes a ring between ``latom`` (where the ring number was first
		    seen) and ``ratom`` (where it was seen again) by adding a ring bond
		    to ``mol``.

		    :raises SMILESParserError: if the two atoms are already bonded, or if
		        the bond symbols written at the two ring numbers disagree.
		    """
		    if mol.has_bond(latom.index, ratom.index):
		        err_msg = "ring bond specified between already-bonded atoms"
		        raise SMILESParserError(smiles, err_msg, ltoken.start_idx)

		    lbond_char = ltoken.extract_bond_char(smiles)
		    rbond_char = rtoken.extract_bond_char(smiles)

		    # Order the two symbols so that a missing one (None) comes second.
		    first, second = lbond_char, rbond_char
		    if first is None:
		        first, second = second, first

		    # The symbols agree when they are equal, when at most one of them
		    # was written, or when both are stereo bonds.
		    bonds_agree = (
		        (first == second)
		        or (second is None)
		        or (first in SMILES_STEREO_BONDS and second in SMILES_STEREO_BONDS)
		    )
		    if not bonds_agree:
		        err_msg = "mismatched ring bonds"
		        raise SMILESParserError(smiles, err_msg, ltoken.start_idx)

		    lorder, lstereo = smiles_to_bond(lbond_char)
		    rorder, rstereo = smiles_to_bond(rbond_char)
		    if (latom.is_aromatic and ratom.is_aromatic
		            and lbond_char is None and rbond_char is None):
		        # handle implicit aromatic bonds, e.g. c1ccccc1
		        lorder = rorder = 1.5

		    # an unwritten bond defaults to order 1, so the larger order is the
		    # explicitly written one, if any
		    ring_order = max(lorder, rorder)
		    mol.add_ring_bond(
		        a=latom.index, a_stereo=lstereo, a_pos=lpos,
		        b=ratom.index, b_stereo=rstereo,
		        order=ring_order
		    )


		# =============================================================================
		# SMILES <- Atom, Graph, etc.
		# =============================================================================


		def atom_to_smiles(atom: "Atom", brackets: bool = True) -> str:
		    """Converts an atom into its SMILES representation.

		    An atom with no isotope, chirality or hydrogen count and with zero
		    charge is written as its bare element symbol. Any other atom is
		    written in bracket-atom form.

		    :param atom: the input atom; must not be aromatic.
		    :param brackets: True, if brackets should be added around the returned
		        symbol (e.g. in the case of [C] or [C@@H]). Defaults to True.
		    :return: a SMILES symbol representing the input atom.
		    """
		    assert not atom.is_aromatic

		    specs = (atom.isotope, atom.chirality, atom.h_count, atom.charge)
		    if specs == (None, None, None, 0):
		        return atom.element

		    builder = []
		    if brackets:
		        builder.append("[")
		    if atom.isotope is not None:
		        builder.append(str(atom.isotope))
		    builder.append(atom.element)
		    if atom.chirality is not None:
		        builder.append(atom.chirality)

		    if atom.h_count is None:
		        # Unspecified hydrogen count together with some other spec:
		        # write no H field. Comparing None != 0 would be true and emit
		        # the invalid text "HNone".
		        pass
		    elif atom.h_count != 0:
		        builder.append("H")
		        builder.append(str(atom.h_count))
		    elif specs == (None, None, 0, 0) and (atom.element in ORGANIC_SUBSET):
		        # a bare [C] is spelled [CH0] to make the zero count explicit
		        builder.append("H0")

		    if atom.charge != 0:
		        builder.append("{:+}".format(atom.charge))  # always signed
		    if brackets:
		        builder.append("]")

		    return "".join(builder)


		def bond_to_smiles(bond: "DirectedBond") -> str:
		    """Produces the SMILES symbol for a bond.

		    Single bonds are written as the empty string unless they carry one of
		    the stereo symbols, which is then returned as-is.

		    :param bond: the input bond.
		    :return: a SMILES symbol representing the input bond.
		    :raises ValueError: if the bond order is not 1, 2 or 3.
		    """

		    bond_order = bond.order
		    if bond_order == 2:
		        return "="
		    if bond_order == 3:
		        return "#"
		    if bond_order == 1:
		        if bond.stereo in SMILES_STEREO_BONDS:
		            return bond.stereo
		        return ""
		    # this should never happen
		    raise ValueError()


		def mol_to_smiles(mol: MolecularGraph) -> str:
		    """Writes a molecular graph out as a SMILES string, following the
		    traversal order recorded in the graph.

		    Each root of the graph yields one fragment; fragments are joined
		    with dots. Ring numbers are shared across all fragments.

		    :param mol: the input molecule; must be kekulized.
		    :return: a SMILES string representing the input molecule.
		    """
		    assert mol.is_kekulized()

		    ring_numbers = dict()  # ring-bond endpoints -> assigned ring number
		    pieces = []
		    for root_idx in mol.get_roots():
		        symbols = []
		        _derive_smiles_from_fragment(symbols, mol, root_idx, ring_numbers)
		        pieces.append("".join(symbols))
		    return ".".join(pieces)


		def _derive_smiles_from_fragment(derived, mol, root, ring_log):
		    """Appends to ``derived`` the SMILES symbols of the atom at index
		    ``root`` and, recursively, of everything reached through its outgoing
		    bonds.

		    :param derived: list of SMILES symbols; extended in place.
		    :param mol: the molecular graph being written out.
		    :param root: index of the atom to start from.
		    :param ring_log: maps the (low, high) endpoints of a ring bond to its
		        ring number; extended in place as new ring bonds are met.
		    """
		    derived.append(atom_to_smiles(mol.get_atom(root)))

		    bonds = mol.get_out_dirbonds(root)
		    for pos, bond in enumerate(bonds):
		        if bond.ring_bond:
		            derived.append(bond_to_smiles(bond))
		            key = (min(bond.src, bond.dst), max(bond.src, bond.dst))
		            # a ring bond seen for the first time takes the next number;
		            # its other end reuses the number stored here
		            next_number = len(ring_log) + 1
		            number = ring_log.setdefault(key, next_number)
		            if number >= 10:
		                derived.append("%")
		            derived.append(str(number))
		            continue

		        # every outgoing bond except the last one is written as a branch
		        is_branch = pos < len(bonds) - 1
		        if is_branch:
		            derived.append("(")

		        derived.append(bond_to_smiles(bond))
		        _derive_smiles_from_fragment(derived, mol, bond.dst, ring_log)

		        if is_branch:
		            derived.append(")")

+89

-98

PKG-INFO

		Metadata-Version: 2.1
		Name: selfies
		Version: 1.0.4
		Version: 2.0.0
		Summary: SELFIES (SELF-referencIng Embedded Strings) is a general-purpose, sequence-based, robust representation of semantically constrained graphs.
		Home-page: https://github.com/aspuru-guzik-group/selfies
		Author: Mario Krenn
		Author: Mario Krenn, Alston Lo, and many other contributors
		Author-email: mario.krenn@utoronto.ca, alan@aspuru.com
		@@ -16,20 +16,24 @@ License: UNKNOWN
		[![GitHub issues](https://img.shields.io/github/issues/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/issues/)
		[![Documentation Status](https://readthedocs.org/projects/selfies/badge/?version=latest)](http://selfies.readthedocs.io/?badge=latest)
		[![Documentation Status](https://readthedocs.org/projects/selfiesv2/badge/?version=latest)](http://selfiesv2.readthedocs.io/?badge=latest)
		[![GitHub contributors](https://img.shields.io/github/contributors/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/)


		Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation<br>
		_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br>
		[Machine Learning: Science and Technology 1, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br>
		[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br>
		Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br>
		Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation\
		_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\
		[Machine Learning: Science and Technology 1, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\
		[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\
		[Blog explaining SELFIES in Japanese language](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\
		Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\
		Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ)

		A main objective is to use SELFIES as direct input into machine learning models,<br>
		in particular in generative models, for the generation of molecular graphs<br>
		---

		A main objective is to use SELFIES as direct input into machine learning models,
		in particular in generative models, for the generation of molecular graphs
		which are syntactically and semantically valid.

		<center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center>
		<p align="center">
		<img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px">
		</p>


		## Installation
		@@ -52,3 +56,3 @@ Use pip to install ``selfies``.
		[CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md)
		to review the changes between versions of `selfies`:
		to review the changes between versions of `selfies`, before upgrading:

		@@ -59,28 +63,23 @@ ```bash

		## Documentation

		The documentation can be found on
		[ReadTheDocs](https://selfies.readthedocs.io/en/latest/).
		Alternatively, it can be built from the ``docs/`` directory.

		## Usage

		### Standard Functions
		### Overview

		The ``selfies`` library has eight standard functions:
		Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/),
		which contains a thorough tutorial for getting started with ``selfies``
		and detailed descriptions of the functions
		that ``selfies`` provides. We summarize some key functions below.

		| Function | Description |
		| -------- | ----------- |
		| ``selfies.encoder`` | Translates a SMILES into an equivalent SELFIES. |
		| ``selfies.decoder`` | Translates a SELFIES into an equivalent SMILES. |
		| ``selfies.len_selfies`` | Returns the (symbol) length of a SELFIES. |
		| ``selfies.split_selfies`` | Splits a SELFIES into its symbols. |
		| ``selfies.get_alphabet_from_selfies`` | Builds an alphabet of SELFIES symbols from an iterable of SELFIES. |
		| ``selfies.get_semantic_robust_alphabet`` | Returns a subset of all SELFIES symbols that are semantically constrained. |
		| ``selfies.selfies_to_encoding`` | Converts a SELFIES into a label and/or one-hot encoding. |
		| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES. |
		| ``selfies.encoder`` | Translates a SMILES string into its corresponding SELFIES string. |
		| ``selfies.decoder`` | Translates a SELFIES string into its corresponding SMILES string. |
		| ``selfies.set_semantic_constraints`` | Configures the semantic constraints that ``selfies`` operates on. |
		| ``selfies.len_selfies`` | Returns the number of symbols in a SELFIES string. |
		| ``selfies.split_selfies`` | Tokenizes a SELFIES string into its individual symbols. |
		| ``selfies.get_alphabet_from_selfies`` | Constructs an alphabet from an iterable of SELFIES strings. |
		| ``selfies.selfies_to_encoding`` | Converts a SELFIES string into its label and/or one-hot encoding. |
		| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES string. |

		Please read the documentation for more detailed descriptions of these
		functions, and to view the advanced functions, which allow users to
		customize the SELFIES language.

		@@ -96,19 +95,41 @@ ### Examples

		# SMILES --> SELFIES translation
		encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]'
		# SMILES -> SELFIES -> SMILES translation
		try:
		benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1]
		benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1
		except sf.EncoderError:
		pass # sf.encoder error!
		except sf.DecoderError:
		pass # sf.decoder error!

		# SELFIES --> SMILES translation
		decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1'
		len_benzene = sf.len_selfies(benzene_sf) # 8

		len_benzene = sf.len_selfies(encoded_selfies) # 8
		symbols_benzene = list(sf.split_selfies(benzene_sf))
		# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]']
		```

		symbols_benzene = list(sf.split_selfies(encoded_selfies))
		# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]']
		#### Customizing SELFIES:

		In this example, we relax the semantic constraints of ``selfies`` to allow
		for hypervalences (caution: hypervalence rules are much less understood
		than octet rules. Some molecules containing hypervalences are important,
		but generally, it is not known which molecules are stable and reasonable).

		```python
		import selfies as sf

		hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid
		standard_derived_smi = sf.decoder(hypervalent_sf)
		# OI (the default constraints for I allows for only 1 bond)

		sf.set_semantic_constraints("hypervalent")
		relaxed_derived_smi = sf.decoder(hypervalent_sf)
		# O=I(O)(O)(O)(O)O (the hypervalent constraints for I allows for 7 bonds)
		```

		#### Integer and one-hot encoding SELFIES:
		In this example we first build an alphabet
		from a dataset of SELFIES, and then convert a SELFIES into a
		padded, label-encoded representation. Note that we use the
		``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))

		In this example, we first build an alphabet from a dataset of SELFIES strings,
		and then convert a SELFIES string into its padded encoding. Note that we use the
		``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
		symbol to pad our SELFIES, which is a special SELFIES symbol that is always
		@@ -121,7 +142,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful

		dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]']
		dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"]
		alphabet = sf.get_alphabet_from_selfies(dataset)
		alphabet.add('[nop]') # '[nop]' is a special padding symbol
		alphabet = list(sorted(alphabet))
		print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']
		alphabet.add("[nop]") # [nop] is a special padding symbol
		alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']

		@@ -131,19 +151,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5

		# SELFIES to label encode
		dimethyl_ether = dataset[0] # '[C][O][C]'
		dimethyl_ether = dataset[0] # [C][O][C]

		# [1, 3, 1, 4, 4]
		print(sf.selfies_to_encoding(dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type='label'))

		# [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
		print(sf.selfies_to_encoding(dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type='one_hot'))
		label, one_hot = sf.selfies_to_encoding(
		selfies=dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type="both"
		)
		# label = [1, 3, 1, 4, 4]
		# one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
		```

		### More Examples
		### More Usages and Examples

		@@ -158,19 +174,6 @@ * More examples can be found in the ``examples/`` directory, including a
		* Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator).
		* An improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling.


		## Handling invalid inputs
		If an invalid input is presented to the encoder or decoder, the return value is `None`.
		The error can be analysed by using the `encoder(...,print_error=True)` option.
		```python
		import selfies as sf
		invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1"
		selfies_string=sf.encoder(invalid_smiles)

		if selfies_string==None:
		selfies_string=sf.encoder(invalid_smiles,print_error=True)
		# 'Encoding error 'C[C@H](O)[C@@(*)C1=CC=CC=C1': wildcard atom '*' not supported.'
		```

		## Tests
		SELFIES uses `pytest` with `tox` as its testing framework.
		`selfies` uses `pytest` with `tox` as its testing framework.
		All tests can be found in the `tests/` directory. To run the test suite for
		@@ -180,29 +183,17 @@ SELFIES, install ``tox`` and run:
		```bash
		tox
		tox -- --trials=10000 --dataset_samples=10000
		```

		By default, SELFIES is tested against a random subset
		(of size ``dataset_samples=100000``) on various datasets:
		By default, `selfies` is tested against a random subset
		(of size ``dataset_samples=10000``) on various datasets:

		* 130K molecules from [QM9](https://www.nature.com/articles/sdata201422)
		* 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database)
		* 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
		* 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet
		* 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet
		* 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database).
		* 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
		* 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets
		* 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html).
		Due to its large size, this dataset is not included on the repository. To run tests
		on it, please download the dataset in the ``tests/test_sets`` directory
		and enable its pytest at ``tests/test_on_emolecules.py``.
		on it, please download the dataset into the ``tests/test_sets`` directory
		and run the ``tests/run_on_large_dataset.py`` script.

		Other tests are random and repeated ``trials`` number of times.
		These can be specified as arguments

		```bash
		tox -- --trials 100 --dataset_samples 100
		```

		where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that
		if ``dataset_samples`` is negative or exceeds the length of the dataset,
		the whole dataset is used.

		## Version History
		@@ -213,5 +204,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md).

		We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin,
		HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean),
		Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports,
		We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin,
		HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan),
		Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports,
		and Robert Pollice for chemistry advice.
		@@ -218,0 +209,0 @@

+87

-96

README.md

		@@ -8,20 +8,24 @@ # SELFIES
		[![GitHub issues](https://img.shields.io/github/issues/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/issues/)
		[![Documentation Status](https://readthedocs.org/projects/selfies/badge/?version=latest)](http://selfies.readthedocs.io/?badge=latest)
		[![Documentation Status](https://readthedocs.org/projects/selfiesv2/badge/?version=latest)](http://selfiesv2.readthedocs.io/?badge=latest)
		[![GitHub contributors](https://img.shields.io/github/contributors/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/)


		Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation<br>
		_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br>
		[Machine Learning: Science and Technology 1, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br>
		[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br>
		Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br>
		Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation\
		_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\
		[Machine Learning: Science and Technology 1, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\
		[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\
		[Blog explaining SELFIES in Japanese language](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\
		Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\
		Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ)

		A main objective is to use SELFIES as direct input into machine learning models,<br>
		in particular in generative models, for the generation of molecular graphs<br>
		---

		A main objective is to use SELFIES as direct input into machine learning models,
		in particular in generative models, for the generation of molecular graphs
		which are syntactically and semantically valid.

		<center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center>
		<p align="center">
		<img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px">
		</p>


		## Installation
		@@ -44,3 +48,3 @@ Use pip to install ``selfies``.
		[CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md)
		to review the changes between versions of `selfies`:
		to review the changes between versions of `selfies`, before upgrading:

		@@ -51,28 +55,23 @@ ```bash

		## Documentation

		The documentation can be found on
		[ReadTheDocs](https://selfies.readthedocs.io/en/latest/).
		Alternatively, it can be built from the ``docs/`` directory.

		## Usage

		### Standard Functions
		### Overview

		The ``selfies`` library has eight standard functions:
		Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/),
		which contains a thorough tutorial for getting started with ``selfies``
		and detailed descriptions of the functions
		that ``selfies`` provides. We summarize some key functions below.

		| Function | Description |
		| -------- | ----------- |
		| ``selfies.encoder`` | Translates a SMILES into an equivalent SELFIES. |
		| ``selfies.decoder`` | Translates a SELFIES into an equivalent SMILES. |
		| ``selfies.len_selfies`` | Returns the (symbol) length of a SELFIES. |
		| ``selfies.split_selfies`` | Splits a SELFIES into its symbols. |
		| ``selfies.get_alphabet_from_selfies`` | Builds an alphabet of SELFIES symbols from an iterable of SELFIES. |
		| ``selfies.get_semantic_robust_alphabet`` | Returns a subset of all SELFIES symbols that are semantically constrained. |
		| ``selfies.selfies_to_encoding`` | Converts a SELFIES into a label and/or one-hot encoding. |
		| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES. |
		| ``selfies.encoder`` | Translates a SMILES string into its corresponding SELFIES string. |
		| ``selfies.decoder`` | Translates a SELFIES string into its corresponding SMILES string. |
		| ``selfies.set_semantic_constraints`` | Configures the semantic constraints that ``selfies`` operates on. |
		| ``selfies.len_selfies`` | Returns the number of symbols in a SELFIES string. |
		| ``selfies.split_selfies`` | Tokenizes a SELFIES string into its individual symbols. |
		| ``selfies.get_alphabet_from_selfies`` | Constructs an alphabet from an iterable of SELFIES strings. |
		| ``selfies.selfies_to_encoding`` | Converts a SELFIES string into its label and/or one-hot encoding. |
		| ``selfies.encoding_to_selfies`` | Converts a label or one-hot encoding into a SELFIES string. |

		Please read the documentation for more detailed descriptions of these
		functions, and to view the advanced functions, which allow users to
		customize the SELFIES language.

		@@ -88,19 +87,41 @@ ### Examples

		# SMILES --> SELFIES translation
		encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]'
		# SMILES -> SELFIES -> SMILES translation
		try:
		benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1]
		benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1
		except sf.EncoderError:
		pass # sf.encoder error!
		except sf.DecoderError:
		pass # sf.decoder error!

		# SELFIES --> SMILES translation
		decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1'
		len_benzene = sf.len_selfies(benzene_sf) # 8

		len_benzene = sf.len_selfies(encoded_selfies) # 8
		symbols_benzene = list(sf.split_selfies(benzene_sf))
		# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]']
		```

		symbols_benzene = list(sf.split_selfies(encoded_selfies))
		# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]']
		#### Customizing SELFIES:

		In this example, we relax the semantic constraints of ``selfies`` to allow
		for hypervalences (caution: hypervalence rules are much less understood
		than octet rules. Some molecules containing hypervalences are important,
		but generally, it is not known which molecules are stable and reasonable).

		```python
		import selfies as sf

		hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid
		standard_derived_smi = sf.decoder(hypervalent_sf)
		# OI (the default constraints for I allows for only 1 bond)

		sf.set_semantic_constraints("hypervalent")
		relaxed_derived_smi = sf.decoder(hypervalent_sf)
		# O=I(O)(O)(O)(O)O (the hypervalent constraints for I allows for 7 bonds)
		```

		#### Integer and one-hot encoding SELFIES:
		In this example we first build an alphabet
		from a dataset of SELFIES, and then convert a SELFIES into a
		padded, label-encoded representation. Note that we use the
		``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))

		In this example, we first build an alphabet from a dataset of SELFIES strings,
		and then convert a SELFIES string into its padded encoding. Note that we use the
		``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
		symbol to pad our SELFIES, which is a special SELFIES symbol that is always
		@@ -113,7 +134,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful

		dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]']
		dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"]
		alphabet = sf.get_alphabet_from_selfies(dataset)
		alphabet.add('[nop]') # '[nop]' is a special padding symbol
		alphabet = list(sorted(alphabet))
		print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']
		alphabet.add("[nop]") # [nop] is a special padding symbol
		alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']

		@@ -123,19 +143,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5

		# SELFIES to label encode
		dimethyl_ether = dataset[0] # '[C][O][C]'
		dimethyl_ether = dataset[0] # [C][O][C]

		# [1, 3, 1, 4, 4]
		print(sf.selfies_to_encoding(dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type='label'))

		# [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
		print(sf.selfies_to_encoding(dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type='one_hot'))
		label, one_hot = sf.selfies_to_encoding(
		selfies=dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type="both"
		)
		# label = [1, 3, 1, 4, 4]
		# one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
		```

		### More Examples
		### More Usages and Examples

		@@ -150,19 +166,6 @@ * More examples can be found in the ``examples/`` directory, including a
		* Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator).
		* An improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling.


		## Handling invalid inputs
		If an invalid input is presented to the encoder or decoder, the return value is `None`.
		The error can be analysed by using the `encoder(...,print_error=True)` option.
		```python
		import selfies as sf
		invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1"
		selfies_string=sf.encoder(invalid_smiles)

		if selfies_string==None:
		selfies_string=sf.encoder(invalid_smiles,print_error=True)
		# 'Encoding error 'C[C@H](O)[C@@()C1=CC=CC=C1': wildcard atom '' not supported.'
		```

		## Tests
		SELFIES uses `pytest` with `tox` as its testing framework.
		`selfies` uses `pytest` with `tox` as its testing framework.
		All tests can be found in the `tests/` directory. To run the test suite for
		@@ -172,29 +175,17 @@ SELFIES, install ``tox`` and run:
		```bash
		tox
		tox -- --trials=10000 --dataset_samples=10000
		```

		By default, SELFIES is tested against a random subset
		(of size ``dataset_samples=100000``) on various datasets:
		By default, `selfies` is tested against a random subset
		(of size ``dataset_samples=10000``) on various datasets:

		* 130K molecules from [QM9](https://www.nature.com/articles/sdata201422)
		* 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database)
		* 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
		* 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet
		* 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet
		* 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database).
		* 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
		* 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets
		* 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html).
		Due to its large size, this dataset is not included on the repository. To run tests
		on it, please download the dataset in the ``tests/test_sets`` directory
		and enable its pytest at ``tests/test_on_emolecules.py``.
		on it, please download the dataset into the ``tests/test_sets`` directory
		and run the ``tests/run_on_large_dataset.py`` script.

		Other tests are random and repeated ``trials`` number of times.
		These can be specified as arguments

		```bash
		tox -- --trials 100 --dataset_samples 100
		```

		where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that
		if ``dataset_samples`` is negative or exceeds the length of the dataset,
		the whole dataset is used.

		## Version History
		@@ -205,5 +196,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md).

		We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin,
		HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean),
		Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports,
		We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin,
		HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan),
		Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports,
		and Robert Pollice for chemistry advice.
		@@ -210,0 +201,0 @@

+89

-98

selfies.egg-info/PKG-INFO

		Metadata-Version: 2.1
		Name: selfies
		Version: 1.0.4
		Version: 2.0.0
		Summary: SELFIES (SELF-referencIng Embedded Strings) is a general-purpose, sequence-based, robust representation of semantically constrained graphs.
		Home-page: https://github.com/aspuru-guzik-group/selfies
		Author: Mario Krenn
		Author: Mario Krenn, Alston Lo, and many other contributors
		Author-email: mario.krenn@utoronto.ca, alan@aspuru.com
		@@ -16,20 +16,24 @@ License: UNKNOWN
		[![GitHub issues](https://img.shields.io/github/issues/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/issues/)
		[![Documentation Status](https://readthedocs.org/projects/selfies/badge/?version=latest)](http://selfies.readthedocs.io/?badge=latest)
		[![Documentation Status](https://readthedocs.org/projects/selfiesv2/badge/?version=latest)](http://selfiesv2.readthedocs.io/?badge=latest)
		[![GitHub contributors](https://img.shields.io/github/contributors/aspuru-guzik-group/selfies.svg)](https://GitHub.com/aspuru-guzik-group/selfies/graphs/contributors/)


		Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation<br>
		_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_<br>
		[Machine Learning: Science and Technology 1, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).<br>
		[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).<br>
		Major contributors since v1.0.0: _[Alston Lo](https://github.com/aspuru-guzik-group/selfies/commits?author=alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_<br>
		Self-Referencing Embedded Strings (SELFIES): A 100% robust molecular string representation\
		_Mario Krenn, Florian Haese, AkshatKumar Nigam, Pascal Friederich, Alan Aspuru-Guzik_\
		[Machine Learning: Science and Technology 1, 045024 (2020)](https://iopscience.iop.org/article/10.1088/2632-2153/aba947), [extensive blog post January 2021](https://aspuru.substack.com/p/molecular-graph-representations-and).\
		[Talk on youtube about SELFIES](https://www.youtube.com/watch?v=CaIyUmfGXDk).\
		[Blog explaining SELFIES in Japanese language](https://blacktanktop.hatenablog.com/entry/2021/08/12/115613)\
		Major contributors since v1.0.0: _[Alston Lo](https://github.com/alstonlo) and [Seyone Chithrananda](https://github.com/seyonechithrananda)_\
		Chemistry Advisor: [Robert Pollice](https://scholar.google.at/citations?user=JR2N3JIAAAAJ)

		A main objective is to use SELFIES as direct input into machine learning models,<br>
		in particular in generative models, for the generation of molecular graphs<br>
		---

		A main objective is to use SELFIES as direct input into machine learning models,
		in particular in generative models, for the generation of molecular graphs
		which are syntactically and semantically valid.

		<center><img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px"></center>
		<p align="center">
		<img src="https://github.com/aspuru-guzik-group/selfies/blob/master/examples/VAE_LS_Validity.png" alt="SELFIES validity in a VAE latent space" width="666px">
		</p>


		## Installation
		@@ -52,3 +56,3 @@ Use pip to install ``selfies``.
		[CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md)
		to review the changes between versions of `selfies`:
		to review the changes between versions of `selfies`, before upgrading:

		@@ -59,28 +63,23 @@ ```bash

		## Documentation

		The documentation can be found on
		[ReadTheDocs](https://selfies.readthedocs.io/en/latest/).
		Alternatively, it can be built from the ``docs/`` directory.

		## Usage

		### Standard Functions
		### Overview

		The ``selfies`` library has eight standard functions:
		Please refer to the [documentation](https://selfiesv2.readthedocs.io/en/latest/),
		which contains a thorough tutorial for getting started with ``selfies``
		and detailed descriptions of the functions
		that ``selfies`` provides. We summarize some key functions below.

		\| Function \| Description \|
		\| -------- \| ----------- \|
		\| ``selfies.encoder`` \| Translates a SMILES into an equivalent SELFIES. \|
		\| ``selfies.decoder`` \| Translates a SELFIES into an equivalent SMILES. \|
		\| ``selfies.len_selfies`` \| Returns the (symbol) length of a SELFIES. \|
		\| ``selfies.split_selfies`` \| Splits a SELFIES into its symbols. \|
		\| ``selfies.get_alphabet_from_selfies`` \| Builds an alphabet of SELFIES symbols from an iterable of SELFIES. \|
		\| ``selfies.get_semantic_robust_alphabet`` \| Returns a subset of all SELFIES symbols that are semantically constrained. \|
		\| ``selfies.selfies_to_encoding`` \| Converts a SELFIES into a label and/or one-hot encoding. \|
		\| ``selfies.encoding_to_selfies`` \| Converts a label or one-hot encoding into a SELFIES. \|
		\| ``selfies.encoder`` \| Translates a SMILES string into its corresponding SELFIES string. \|
		\| ``selfies.decoder`` \| Translates a SELFIES string into its corresponding SMILES string. \|
		\| ``selfies.set_semantic_constraints`` \| Configures the semantic constraints that ``selfies`` operates on. \|
		\| ``selfies.len_selfies`` \| Returns the number of symbols in a SELFIES string. \|
		\| ``selfies.split_selfies`` \| Tokenizes a SELFIES string into its individual symbols. \|
		\| ``selfies.get_alphabet_from_selfies`` \| Constructs an alphabet from an iterable of SELFIES strings. \|
		\| ``selfies.selfies_to_encoding`` \| Converts a SELFIES string into its label and/or one-hot encoding. \|
		\| ``selfies.encoding_to_selfies`` \| Converts a label or one-hot encoding into a SELFIES string. \|

		Please read the documentation for more detailed descriptions of these
		functions, and to view the advanced functions, which allow users to
		customize the SELFIES language.

		@@ -96,19 +95,41 @@ ### Examples

		# SMILES --> SELFIES translation
		encoded_selfies = sf.encoder(benzene) # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]'
		# SMILES -> SELFIES -> SMILES translation
		try:
		benzene_sf = sf.encoder(benzene) # [C][=C][C][=C][C][=C][Ring1][=Branch1]
		benzene_smi = sf.decoder(benzene_sf) # C1=CC=CC=C1
		except sf.EncoderError:
		pass # sf.encoder error!
		except sf.DecoderError:
		pass # sf.decoder error!

		# SELFIES --> SMILES translation
		decoded_smiles = sf.decoder(encoded_selfies) # 'C1=CC=CC=C1'
		len_benzene = sf.len_selfies(benzene_sf) # 8

		len_benzene = sf.len_selfies(encoded_selfies) # 8
		symbols_benzene = list(sf.split_selfies(benzene_sf))
		# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]']
		```

		symbols_benzene = list(sf.split_selfies(encoded_selfies))
		# ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[Branch1_2]']
		#### Customizing SELFIES:

		In this example, we relax the semantic constraints of ``selfies`` to allow
		for hypervalences (caution: hypervalence rules are much less understood
		than octet rules. Some molecules containing hypervalences are important,
		but generally, it is not known which molecules are stable and reasonable).

		```python
		import selfies as sf

		hypervalent_sf = sf.encoder('O=I(O)(O)(O)(O)O', strict=False) # orthoperiodic acid
		standard_derived_smi = sf.decoder(hypervalent_sf)
		# OI (the default constraints for I allow for only 1 bond)

		sf.set_semantic_constraints("hypervalent")
		relaxed_derived_smi = sf.decoder(hypervalent_sf)
		# O=I(O)(O)(O)(O)O (the hypervalent constraints for I allow for 7 bonds)
		```

		#### Integer and one-hot encoding SELFIES:
		In this example we first build an alphabet
		from a dataset of SELFIES, and then convert a SELFIES into a
		padded, label-encoded representation. Note that we use the
		``'[nop]'`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))

		In this example, we first build an alphabet from a dataset of SELFIES strings,
		and then convert a SELFIES string into its padded encoding. Note that we use the
		``[nop]`` ([no operation](https://en.wikipedia.org/wiki/NOP_(code) ))
		symbol to pad our SELFIES, which is a special SELFIES symbol that is always
		@@ -121,7 +142,6 @@ ignored and skipped over by ``selfies.decoder``, making it a useful

		dataset = ['[C][O][C]', '[F][C][F]', '[O][=O]', '[C][C][O][C][C]']
		dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"]
		alphabet = sf.get_alphabet_from_selfies(dataset)
		alphabet.add('[nop]') # '[nop]' is a special padding symbol
		alphabet = list(sorted(alphabet))
		print(alphabet) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']
		alphabet.add("[nop]") # [nop] is a special padding symbol
		alphabet = list(sorted(alphabet)) # ['[=O]', '[C]', '[F]', '[O]', '[nop]']

		@@ -131,19 +151,15 @@ pad_to_len = max(sf.len_selfies(s) for s in dataset) # 5

		# SELFIES to label encode
		dimethyl_ether = dataset[0] # '[C][O][C]'
		dimethyl_ether = dataset[0] # [C][O][C]

		# [1, 3, 1, 4, 4]
		print(sf.selfies_to_encoding(dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type='label'))

		# [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
		print(sf.selfies_to_encoding(dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type='one_hot'))
		label, one_hot = sf.selfies_to_encoding(
		selfies=dimethyl_ether,
		vocab_stoi=symbol_to_idx,
		pad_to_len=pad_to_len,
		enc_type="both"
		)
		# label = [1, 3, 1, 4, 4]
		# one_hot = [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
		```

		### More Examples
		### More Usages and Examples

		@@ -158,19 +174,6 @@ * More examples can be found in the ``examples/`` directory, including a
		* Kohulan Rajan, Achim Zielesny, Christoph Steinbeck show in two papers that SELFIES outperforms other representations in [img2string](https://link.springer.com/article/10.1186/s13321-020-00469-w) and [string2string](https://chemrxiv.org/articles/preprint/STOUT_SMILES_to_IUPAC_Names_Using_Neural_Machine_Translation/13469202/1) translation tasks, see the codes of [DECIMER](https://github.com/Kohulan/DECIMER-Image-to-SMILES) and [STOUT](https://github.com/Kohulan/Smiles-TO-iUpac-Translator).
		* An improvement to the old genetic algorithm, the authors have also released [JANUS](https://arxiv.org/abs/2106.04011), which allows for more efficient optimization in the chemical space. JANUS makes use of [STONED-SELFIES](https://pubs.rsc.org/en/content/articlepdf/2021/sc/d1sc00231g) and a neural network for efficient sampling.


		## Handling invalid inputs
		If an invalid input is presented to the encoder or decoder, the return value is `None`.
		The error can be analysed by using the `encoder(...,print_error=True)` option.
		```python
		import selfies as sf
		invalid_smiles="C[C@H](O)[C@@(*)C1=CC=CC=C1"
		selfies_string=sf.encoder(invalid_smiles)

		if selfies_string==None:
		selfies_string=sf.encoder(invalid_smiles,print_error=True)
		# 'Encoding error 'C[C@H](O)[C@@()C1=CC=CC=C1': wildcard atom '' not supported.'
		```

		## Tests
		SELFIES uses `pytest` with `tox` as its testing framework.
		`selfies` uses `pytest` with `tox` as its testing framework.
		All tests can be found in the `tests/` directory. To run the test suite for
		@@ -180,29 +183,17 @@ SELFIES, install ``tox`` and run:
		```bash
		tox
		tox -- --trials=10000 --dataset_samples=10000
		```

		By default, SELFIES is tested against a random subset
		(of size ``dataset_samples=100000``) on various datasets:
		By default, `selfies` is tested against a random subset
		(of size ``dataset_samples=10000``) on various datasets:

		* 130K molecules from [QM9](https://www.nature.com/articles/sdata201422)
		* 250K molecules from [ZINC](https://en.wikipedia.org/wiki/ZINC_database)
		* 50K molecules from [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
		* 8K molecules from [Tox21](http://moleculenet.ai/datasets-1) in MoleculeNet
		* 93K molecules from PubChem [MUV](http://moleculenet.ai/datasets-1) in MoleculeNet
		* 27M molecules from the [eMolecules Plus Database](https://www.emolecules.com/info/plus/download-database).
		* 50K molecules from a dataset of [non-fullerene acceptors for organic solar cells](https://www.sciencedirect.com/science/article/pii/S2542435117301307)
		* 160K+ molecules from various [MoleculeNet](http://moleculenet.ai/datasets-1) datasets
		* 36M+ molecules from the [eMolecules Database](https://www.emolecules.com/info/products-data-downloads.html).
		Due to its large size, this dataset is not included on the repository. To run tests
		on it, please download the dataset in the ``tests/test_sets`` directory
		and enable its pytest at ``tests/test_on_emolecules.py``.
		on it, please download the dataset into the ``tests/test_sets`` directory
		and run the ``tests/run_on_large_dataset.py`` script.

		Other tests are random and repeated ``trials`` number of times.
		These can be specified as arguments

		```bash
		tox -- --trials 100 --dataset_samples 100
		```

		where ``--trials=100000`` and ``--dataset_samples=100000`` by default. Note that
		if ``dataset_samples`` is negative or exceeds the length of the dataset,
		the whole dataset is used.

		## Version History
		@@ -213,5 +204,5 @@ See [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md).

		We thank Jacques Boitreaud, Andrew Brereton, Matthew Carbone (x94carbone), Nathan Frey (ncfrey), Theophile Gaudin,
		HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kevin Ryan (LeanAndMean),
		Benjamin Sanchez-Lengeling, and Zhenpeng Yao for their suggestions and bug reports,
		We thank Jacques Boitreaud, Andrew Brereton, Nessa Carson (supersciencegrl), Matthew Carbone (x94carbone), Vladimir Chupakhin (chupvl), Nathan Frey (ncfrey), Theophile Gaudin,
		HelloJocelynLu, Hyunmin Kim (hmkim), Minjie Li, Vincent Mallet, Alexander Minidis (DocMinus), Kohulan Rajan (Kohulan),
		Kevin Ryan (LeanAndMean), Benjamin Sanchez-Lengeling, Andrew White, Zhenpeng Yao and Adamo Young for their suggestions and bug reports,
		and Robert Pollice for chemistry advice.
		@@ -218,0 +209,0 @@

+12

-3

selfies.egg-info/SOURCES.txt

		README.md
		setup.py
		selfies/__init__.py
		selfies/bond_constraints.py
		selfies/compatibility.py
		selfies/constants.py
		selfies/decoder.py
		selfies/encoder.py
		selfies/exceptions.py
		selfies/grammar_rules.py
		selfies/kekulize.py
		selfies/utils.py
		selfies/mol_graph.py
		selfies.egg-info/PKG-INFO
		selfies.egg-info/SOURCES.txt
		selfies.egg-info/dependency_links.txt
		selfies.egg-info/top_level.txt
		selfies.egg-info/top_level.txt
		selfies/utils/__init__.py
		selfies/utils/encoding_utils.py
		selfies/utils/linked_list.py
		selfies/utils/matching_utils.py
		selfies/utils/selfies_utils.py
		selfies/utils/smiles_utils.py

+21

-20

selfies/__init__.py

		@@ -18,7 +18,7 @@ #!/usr/bin/env python
		Typical usage example:
		import selfies
		import selfies as sf

		benzene = "C1=CC=CC=C1"
		selfies_benzene = selfies.encoder(benzene)
		smiles_benzene = selfies.decoder(selfies_benzene)
		benzene_selfies = sf.encoder(benzene)
		benzene_smiles = sf.decoder(benzene_selfies)

		@@ -29,3 +29,3 @@ For comments, bug reports or feature ideas, please send an email to

		__version__ = "1.0.3"
		__version__ = "2.0.0"

		@@ -35,6 +35,4 @@ __all__ = [
		"decoder",
		"get_preset_constraints",
		"get_semantic_robust_alphabet",
		"get_default_constraints",
		"get_octet_rule_constraints",
		"get_hypervalent_constraints",
		"get_semantic_constraints",
		@@ -49,22 +47,25 @@ "set_semantic_constraints",
		"batch_flat_hot_to_selfies",
		"EncoderError",
		"DecoderError"
		]

		from .bond_constraints import (
		get_preset_constraints,
		get_semantic_constraints,
		get_semantic_robust_alphabet,
		set_semantic_constraints
		)
		from .decoder import decoder
		from .encoder import encoder
		from .grammar_rules import (
		get_semantic_robust_alphabet,
		get_default_constraints,
		get_octet_rule_constraints,
		get_hypervalent_constraints,
		get_semantic_constraints,
		set_semantic_constraints,
		from .exceptions import DecoderError, EncoderError
		from .utils.encoding_utils import (
		batch_flat_hot_to_selfies,
		batch_selfies_to_flat_hot,
		encoding_to_selfies,
		selfies_to_encoding
		)
		from .utils import (
		from .utils.selfies_utils import (
		get_alphabet_from_selfies,
		len_selfies,
		split_selfies,
		selfies_to_encoding,
		batch_selfies_to_flat_hot,
		encoding_to_selfies,
		batch_flat_hot_to_selfies,
		split_selfies
		)

+166

-317

selfies/decoder.py

		@@ -1,372 +0,221 @@
		from collections import OrderedDict
		from typing import Dict, Iterable, List, Optional, Tuple, Union
		import warnings

		from selfies.grammar_rules import (get_bond_from_num,
		get_hypervalent_constraints,
		get_n_from_symbols, get_next_branch_state,
		get_next_state, get_num_from_bond,
		get_octet_rule_constraints,
		get_semantic_constraints,
		set_semantic_constraints)
		from selfies.compatibility import modernize_symbol
		from selfies.exceptions import DecoderError
		from selfies.grammar_rules import (
		get_index_from_selfies,
		next_atom_state,
		next_branch_state,
		next_ring_state,
		process_atom_symbol,
		process_branch_symbol,
		process_ring_symbol
		)
		from selfies.mol_graph import MolecularGraph
		from selfies.utils.selfies_utils import split_selfies
		from selfies.utils.smiles_utils import mol_to_smiles


		def decoder(selfies: str,
		print_error: bool = False,
		constraints: Optional[str] = None) -> Optional[str]:
		"""Translates a SELFIES into a SMILES.
		def decoder(selfies: str, compatible: bool = False) -> str:
		"""Translates a SELFIES string into its corresponding SMILES string.

		The SELFIES to SMILES translation operates based on the :mod:`selfies`
		grammar rules, which can be configured using
		:func:`selfies.set_semantic_constraints`. Given the appropriate settings,
		the decoded SMILES will always be syntactically and semantically correct.
		That is, the output SMILES will satisfy the specified bond constraints.
		Additionally, :func:`selfies.decoder` will attempt to preserve the
		atom and branch order of the input SELFIES.
		This translation is deterministic but depends on the current semantic
		constraints. The output SMILES string is guaranteed to be syntactically
		correct and guaranteed to represent a molecule that obeys the
		semantic constraints.

		:param selfies: the SELFIES to be translated.
		:param print_error: if True, error messages will be printed to console.
		Defaults to False.
		:param constraints: if ``'octet_rule'`` or ``'hypervalent'``,
		the corresponding preset bond constraints will be used instead.
		If ``None``, :func:`selfies.decoder` will use the
		currently configured bond constraints. Defaults to ``None``.
		:return: the SMILES translation of ``selfies``. If an error occurs,
		and ``selfies`` cannot be translated, ``None`` is returned instead.
		:param selfies: the SELFIES string to be translated.
		:param compatible: if ``True``, this function will accept SELFIES strings
		containing deprecated symbols from previous releases. However, this
		function may behave differently than in previous major releases,
		and should not be treated as backward compatible.
		Defaults to ``False``.
		:return: a SMILES string derived from the input SELFIES string.
		:raises DecoderError: if the input SELFIES string is malformed.

		:Example:

		>>> import selfies
		>>> selfies.decoder('[C][=C][F]')
		>>> import selfies as sf
		>>> sf.decoder('[C][=C][F]')
		'C=CF'

		.. seealso:: The
		`"octet_rule" <https://en.wikipedia.org/wiki/Octet_rule>`_
		and
		`"hypervalent" <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_
		preset bond constraints
		can be viewed with :func:`selfies.get_octet_rule_constraints` and
		:func:`selfies.get_hypervalent_constraints`, respectively. These
		presets are variants of the "default" bond constraints, which can
		be viewed with :func:`selfies.get_default_constraints`. Their
		differences can be summarized as follows:

		* def. : ``Cl``, ``Br``, ``I``: 1, ``N``: 3, ``P``: 5, ``P+1``: 6, ``P-1``: 4, ``S``: 6, ``S+1``: 7, ``S-1``: 5
		* oct. : ``Cl``, ``Br``, ``I``: 1, ``N``: 3, ``P``: 3, ``P+1``: 4, ``P-1``: 2, ``S``: 2, ``S+1``: 3, ``S-1``: 1
		* hyp. : ``Cl``, ``Br``, ``I``: 7, ``N``: 5, ``P``: 5, ``P+1``: 6, ``P-1``: 4, ``S``: 6, ``S+1``: 7, ``S-1``: 5
		"""

		old_constraints = get_semantic_constraints()
		if constraints is None:
		pass
		elif constraints == 'octet_rule':
		set_semantic_constraints(get_octet_rule_constraints())
		elif constraints == 'hypervalent':
		set_semantic_constraints(get_hypervalent_constraints())
		else:
		raise ValueError("unrecognized constraint type")
		if compatible:
		msg = "\nselfies.decoder() may behave differently than in previous " \
		"major releases. We recommend using SELFIES that are up to date."
		warnings.warn(msg, stacklevel=2)

		try:
		all_smiles = [] # process dot-separated fragments separately
		mol = MolecularGraph()

		for s in selfies.split("."):
		smiles = _translate_selfies(s)
		rings = []
		for s in selfies.split("."):
		_derive_mol_from_symbols(
		symbol_iter=_tokenize_selfies(s, compatible),
		mol=mol,
		selfies=selfies,
		max_derive=float("inf"),
		init_state=0,
		root_atom=None,
		rings=rings
		)
		_form_rings_bilocally(mol, rings)
		return mol_to_smiles(mol)

		if smiles != "": # prevent malformed dots (e.g. [C]..[C], .[C][C])
		all_smiles.append(smiles)

		if constraints is not None: # restore old constraints
		set_semantic_constraints(old_constraints)
		def _tokenize_selfies(selfies, compatible):
		if isinstance(selfies, str):
		symbol_iter = split_selfies(selfies)
		elif isinstance(selfies, list):
		symbol_iter = selfies
		else:
		raise ValueError() # should not happen

		return '.'.join(all_smiles)

		try:
		for symbol in symbol_iter:
		if symbol == "[nop]":
		continue
		if compatible:
		symbol = modernize_symbol(symbol)
		yield symbol
		except ValueError as err:
		if constraints is not None: # restore old constraints
		set_semantic_constraints(old_constraints)
		raise DecoderError(str(err)) from None

		if print_error:
		print("Decoding error '{}': {}.".format(selfies, err))
		return None


		def _parse_selfies(selfies: str) -> Iterable[str]:
		"""Parses a SELFIES into its symbols.

		A generator, which parses a SELFIES and yields its symbols
		one-by-one. When no symbols are left in the SELFIES, the empty
		string is infinitely yielded. As a precondition, the input SELFIES contains
		no dots, so all symbols are enclosed by square brackets, e.g. [X].

		:param selfies: the SElFIES string to be parsed.
		:return: an iterable of the symbols of the SELFIES.
		"""

		left_idx = selfies.find('[')

		while 0 <= left_idx < len(selfies):
		right_idx = selfies.find(']', left_idx + 1)

		if (selfies[left_idx] != '[') or (right_idx == -1):
		raise ValueError("malformed SELIFES, "
		"misplaced or missing brackets")

		next_symbol = selfies[left_idx: right_idx + 1]
		left_idx = right_idx + 1

		if next_symbol != '[nop]': # skip [nop]
		yield next_symbol

		while True: # no more symbols left
		yield ''


		def _parse_selfies_symbols(selfies_symbols: List[str]) -> Iterable[str]:
		"""Equivalent to ``_parse_selfies``, except the input SELFIES is presented
		as a list of SELFIES symbols, as opposed to a string.

		:param selfies_symbols: a SELFIES represented as a list of SELFIES symbols.
		:return: an iterable of the symbols of the SELFIES.
		"""
		for symbol in selfies_symbols:

		if symbol != '[nop]':
		yield symbol

		while True:
		yield ''


		def _translate_selfies(selfies: str) -> str:
		    """Translate a single SELFIES into a SMILES, on behalf of
		    ``selfies.decoder``. The input is assumed to contain no dots.

		    :param selfies: the SELFIES to be translated.
		    :return: the SMILES translation of the SELFIES.
		    """

		    # One three-element list per derived atom:
		    #   [0] the atom's SMILES text together with its connecting bond
		    #       (e.g. =C, #N, N, C)
		    #   [1] how many bonds the atom still has available
		    #   [2] the index of the previously derived atom it is bonded to
		    # For instance, if atom 6 is 'C', has 2 bonds left and is attached to
		    # atom 5 by a double bond, then derived[6] = ['=C', 2, 5].
		    derived = []

		    # Branches to be made, keyed by start: a branch running from the i-th
		    # to the j-th derived atom is stored as branches[i] = j. Two branches
		    # should never share a start atom (as they would in C((C)Cl)C).
		    branches = {}

		    # Rings to be made, in left-to-right SELFIES order. A ring between
		    # derived atoms j and k (j <= k) with bond symbol s ('=', '#', '\',
		    # etc.) is recorded as the tuple (j, k, s).
		    rings = []

		    _translate_selfies_derive(
		        _parse_selfies(selfies), 0, derived, -1, branches, rings
		    )
		    _form_rings_bilocally(derived, rings)

		    # wrap each registered branch in parentheses
		    for start, end in branches.items():
		        opening, closing = derived[start], derived[end]
		        opening[0] = '(' + opening[0]
		        closing[0] += ')'

		    # the SMILES is the concatenation of every derived atom's text
		    return "".join(text for text, _, _ in derived)


		# flake8: noqa: C901
		# noinspection PyTypeChecker
		def _translate_selfies_derive(selfies_gen: Iterable[str],
		init_state: int,
		derived: List[List[Union[str, int]]],
		prev_idx: int,
		branches: Dict[int, int],
		rings: List[Tuple[int, int, str]]) -> None:
		"""Recursive helper for _translate_selfies.

		Derives the SMILES symbols one-by-one from a SELFIES, and
		populates derived, branches, and rings. The main chain and side branches
		of the SELFIES are translated recursively. Rings are not actually
		translated, but saved to the rings list to be added later.

		:param selfies_gen: an iterable of the symbols of the SELFIES to be
		translated, created by ``_parse_selfies``.
		:param init_state: the initial derivation state.
		:param derived: see ``derived`` in ``_translate_selfies``.
		:param prev_idx: the index of the previously derived atom, or -1,
		if no atoms have been derived yet.
		:param branches: see ``branches`` in ``_translate_selfies``.
		:param rings: see ``rings`` in ``_translate_selfies``.
		:return: ``None``.
		"""

		curr_symbol = next(selfies_gen)
		def _derive_mol_from_symbols(
		symbol_iter, mol, selfies, max_derive,
		init_state, root_atom, rings
		):
		n_derived = 0
		state = init_state
		prev_atom = root_atom

		while curr_symbol != '' and state >= 0:
		while (state is not None) and (n_derived < max_derive):

		# Case 1: Branch symbol (e.g. [Branch1_2])
		if 'Branch' in curr_symbol:
		try: # retrieve next symbol
		symbol = next(symbol_iter)
		n_derived += 1
		except StopIteration:
		break

		branch_init_state, new_state = \
		get_next_branch_state(curr_symbol, state)
		# Case 1: Branch symbol (e.g. [Branch1])
		if "ch" == symbol[-4:-2]:

		if state <= 1: # state = 0, 1
		pass # ignore no symbols
		output = process_branch_symbol(symbol)
		if output is None:
		_raise_decoder_error(selfies, symbol)
		btype, n = output

		if state <= 1:
		next_state = state
		else:
		L = int(curr_symbol[-4]) # corresponds to [BranchL_X]
		L_symbols = []
		for _ in range(L):
		L_symbols.append(next(selfies_gen))
		binit_state, next_state = next_branch_state(btype, state)

		N = get_n_from_symbols(*L_symbols)
		Q = _read_index_from_selfies(symbol_iter, n_symbols=n)
		n_derived += n + _derive_mol_from_symbols(
		symbol_iter, mol, selfies, (Q + 1),
		init_state=binit_state, root_atom=prev_atom, rings=rings
		)

		branch_symbols = []
		for _ in range(N + 1):
		branch_symbols.append(next(selfies_gen))
		branch_gen = _parse_selfies_symbols(branch_symbols)

		branch_start = len(derived)
		_translate_selfies_derive(branch_gen, branch_init_state,
		derived, prev_idx, branches, rings)
		branch_end = len(derived) - 1

		# resolve C((C)Cl)C --> C(C)(Cl)C
		while branch_start in branches:
		branch_start = branches[branch_start] + 1

		# finally, register the branch in branches
		if branch_start <= branch_end:
		branches[branch_start] = branch_end

		# Case 2: Ring symbol (e.g. [Ring2])
		elif 'Ring' in curr_symbol:
		elif "ng" == symbol[-4:-2]:

		new_state = state
		output = process_ring_symbol(symbol)
		if output is None:
		_raise_decoder_error(selfies, symbol)
		ring_type, n, stereo = output

		if state == 0:
		pass # ignore no symbols

		next_state = state
		else:
		L = int(curr_symbol[-2]) # corresponds to [RingL]
		L_symbols = []
		for _ in range(L):
		L_symbols.append(next(selfies_gen))
		ring_order, next_state = next_ring_state(ring_type, state)
		bond_info = (ring_order, stereo)

		N = get_n_from_symbols(*L_symbols)
		Q = _read_index_from_selfies(symbol_iter, n_symbols=n)
		n_derived += n
		lidx = max(0, prev_atom.index - (Q + 1))
		rings.append((mol.get_atom(lidx), prev_atom, bond_info))

		left_idx = max(0, prev_idx - (N + 1))
		right_idx = prev_idx
		# Case 3: [epsilon]
		elif "eps" in symbol:
		next_state = 0 if (state == 0) else None

		bond_symbol = ''
		if curr_symbol[1:5] == 'Expl':
		bond_symbol = curr_symbol[5]

		rings.append((left_idx, right_idx, bond_symbol))

		# Case 3: regular symbol (e.g. [N], [=C], [F])
		# Case 4: regular symbol (e.g. [N], [=C], [F])
		else:
		new_symbol, new_state = get_next_state(curr_symbol, state)

		if new_symbol != '': # in case of [epsilon]
		derived.append([new_symbol, new_state, prev_idx])
		output = process_atom_symbol(symbol)
		if output is None:
		_raise_decoder_error(selfies, symbol)
		(bond_order, stereo), atom = output
		cap = atom.bonding_capacity

		if prev_idx >= 0:
		bond_num = get_num_from_bond(new_symbol[0])
		derived[prev_idx][1] -= bond_num
		bond_order, next_state = next_atom_state(bond_order, cap, state)
		if bond_order == 0:
		if state == 0:
		mol.add_atom(atom, True)
		else:
		mol.add_atom(atom)
		src, dst = prev_atom.index, atom.index
		mol.add_bond(src=src, dst=dst, order=bond_order, stereo=stereo)
		prev_atom = atom

		prev_idx = len(derived) - 1
		if next_state is None:
		break
		state = next_state

		curr_symbol = next(selfies_gen) # update symbol and state
		state = new_state
		while n_derived < max_derive: # consume remaining tokens
		try:
		next(symbol_iter)
		n_derived += 1
		except StopIteration:
		break

		return n_derived

		def _form_rings_bilocally(derived: List[List[Union[str, int]]],
		rings: List[Tuple[int, int, str]]) -> None:
		"""Forms all the rings specified by the rings list, in first-to-last order,
		by updating derived.

		:param derived: see ``derived`` in ``_translate_selfies``.
		:param rings: see ``rings`` in ``_translate_selfies``.
		:return: ``None``.
		"""
		def _raise_decoder_error(selfies, invalid_symbol):
		    """Raise a ``DecoderError`` reporting an invalid SELFIES symbol.

		    :param selfies: the SELFIES being decoded; included in the message.
		    :param invalid_symbol: the symbol that could not be processed.
		    :raises DecoderError: always.
		    """
		    template = "invalid symbol '{}'\n\tSELFIES: {}"
		    raise DecoderError(template.format(invalid_symbol, selfies))

		# due to the behaviour of allowing multiple rings between the same atom
		# pair, or rings between already bonded atoms, we first resolve all rings
		# so that only valid rings are left and placed into <ring_locs>.
		ring_locs = OrderedDict()

		for left_idx, right_idx, bond_symbol in rings:
		def _read_index_from_selfies(symbol_iter, n_symbols):
		    """Consume ``n_symbols`` symbols and decode them as an index.

		    If the iterator runs out early, each missing position is filled with
		    ``None`` before the symbols are handed to ``get_index_from_selfies``.

		    :param symbol_iter: an iterator over SELFIES symbols; it is advanced
		        by at most ``n_symbols`` items.
		    :param n_symbols: how many symbols make up the index.
		    :return: whatever ``get_index_from_selfies`` returns for the symbols.
		    """
		    # next() with a default stands in for catching StopIteration
		    index_symbols = [next(symbol_iter, None) for _ in range(n_symbols)]
		    return get_index_from_selfies(*index_symbols)

		if left_idx == right_idx: # ring to the same atom forbidden
		continue

		left_end = derived[left_idx]
		right_end = derived[right_idx]
		bond_num = get_num_from_bond(bond_symbol)
		def _form_rings_bilocally(mol, rings):
		rings_made = [0] * len(mol)

		if left_end[1] <= 0 or right_end[1] <= 0:
		continue # no room for bond
		for latom, ratom, bond_info in rings:
		lidx, ridx = latom.index, ratom.index

		if bond_num > min(left_end[1], right_end[1]):
		bond_num = min(left_end[1], right_end[1])
		bond_symbol = get_bond_from_num(bond_num)
		if lidx == ridx: # ring to the same atom forbidden
		continue

		# ring is formed between two atoms that are already bonded
		# e.g. CC1C1C --> CC=CC
		if left_idx == right_end[2]:
		order, (lstereo, rstereo) = bond_info
		lfree = latom.bonding_capacity - mol.get_bond_count(lidx)
		rfree = ratom.bonding_capacity - mol.get_bond_count(ridx)

		right_symbol = right_end[0]
		if lfree <= 0 or rfree <= 0:
		continue # no room for ring bond
		order = min(order, lfree, rfree)

		if right_symbol[0] in {'-', '/', '\\', '=', '#'}:
		old_bond = right_symbol[0]
		else:
		old_bond = ''
		if mol.has_bond(a=lidx, b=ridx):
		bond = mol.get_dirbond(src=lidx, dst=ridx)
		new_order = min(order + bond.order, 3)
		mol.update_bond_order(a=lidx, b=ridx, new_order=new_order)

		# update bond multiplicity and symbol
		new_bond_num = min(bond_num + get_num_from_bond(old_bond), 3)
		new_bond_symbol = get_bond_from_num(new_bond_num)

		right_end[0] = new_bond_symbol + right_end[0][len(old_bond):]

		# ring is formed between two atoms that are not bonded, e.g. C1CC1C
		else:
		loc = (left_idx, right_idx)

		if loc in ring_locs:
		# a ring is formed between two atoms that have previously
		# been bonded by a ring, so ring bond multiplicity is updated

		new_bond_num = min(bond_num
		+ get_num_from_bond(ring_locs[loc]), 3)
		new_bond_symbol = get_bond_from_num(new_bond_num)
		ring_locs[loc] = new_bond_symbol

		else:
		ring_locs[loc] = bond_symbol

		left_end[1] -= bond_num
		right_end[1] -= bond_num

		# finally, use <ring_locs> to add all the rings into <derived>

		ring_counter = 1
		for (left_idx, right_idx), bond_symbol in ring_locs.items():

		ring_id = str(ring_counter)
		if len(ring_id) == 2:
		ring_id = "%" + ring_id
		ring_counter += 1 # increment

		derived[left_idx][0] += bond_symbol + ring_id
		derived[right_idx][0] += bond_symbol + ring_id
		mol.add_ring_bond(
		a=lidx, a_stereo=lstereo, a_pos=rings_made[lidx],
		b=ridx, b_stereo=rstereo, b_pos=rings_made[ridx],
		order=order
		)
		rings_made[lidx] += 1
		rings_made[ridx] += 1

+155

-217

selfies/encoder.py

		@@ -1,265 +0,203 @@
		from typing import Dict, Iterable, List, Optional, Tuple
		from selfies.exceptions import EncoderError, SMILESParserError
		from selfies.grammar_rules import get_selfies_from_index
		from selfies.utils.linked_list import SinglyLinkedList
		from selfies.utils.smiles_utils import (
		atom_to_smiles,
		bond_to_smiles,
		smiles_to_mol
		)

		from selfies.grammar_rules import get_num_from_bond, get_symbols_from_n
		from selfies.kekulize import kekulize_parser

		def encoder(smiles: str, strict: bool = True) -> str:
		"""Translates a SMILES string into its corresponding SELFIES string.

		def encoder(smiles: str, print_error: bool = False) -> Optional[str]:
		"""Translates a SMILES into a SELFIES.
		This translation is deterministic and does not depend on the
		current semantic constraints. Additionally, it preserves the atom order
		of the input SMILES string; thus, one could generate randomized SELFIES
		strings by generating randomized SMILES strings, and then translating them.

		The SMILES to SELFIES translation occurs independently of the SELFIES
		alphabet and grammar. Thus, :func:`selfies.encoder` will work regardless of
		the alphabet and grammar rules that :py:mod:`selfies` is operating on,
		assuming the input is a valid SMILES. Additionally, :func:`selfies.encoder`
		preserves the atom and branch order of the input SMILES; thus, one
		could generate random SELFIES corresponding to the same molecule by
		generating random SMILES, and then translating them.
		By nature of SELFIES, it is impossible to represent molecules that
		violate the current semantic constraints as SELFIES strings.
		Thus, we provide the ``strict`` flag to guard against such cases. If
		``strict=True``, then this function will raise a
		:class:`selfies.EncoderError` if the input SMILES string represents
		a molecule that violates the semantic constraints. If
		``strict=False``, then this function will not raise any error; however,
		calling :func:`selfies.decoder` on a SELFIES string generated this
		way will not be guaranteed to recover a SMILES string representing
		the original molecule.

		However, encoding and then decoding a SMILES may not necessarily yield
		the original SMILES. Reasons include:
		:param smiles: the SMILES string to be translated. It is recommended to
		use RDKit to check that the strings passed into this function
		are valid SMILES strings.
		:param strict: if ``True``, this function will check that the
		input SMILES string obeys the semantic constraints.
		Defaults to ``True``.
		:return: a SELFIES string translated from the input SMILES string.
		:raises EncoderError: if the input SMILES string is invalid,
		cannot be kekulized, or violates the semantic constraints with
		``strict=True``.

		1. SMILES with aromatic symbols are automatically
		Kekulized before being translated.
		2. SMILES that violate the bond constraints specified by
		:mod:`selfies` will be successfully encoded by
		:func:`selfies.encoder`, but then decoded into a new molecule
		that satisfies the constraints.
		3. The exact ring numbering order is lost in :func:`selfies.encoder`,
		and cannot be reconstructed by :func:`selfies.decoder`.

		Finally, note that :func:`selfies.encoder` does not check if the input
		SMILES is valid, and should not be expected to reject invalid inputs.
		It is recommended to use RDKit to first verify that the SMILES are
		valid.

		:param smiles: the SMILES to be translated.
		:param print_error: if True, error messages will be printed to console.
		Defaults to False.
		:return: the SELFIES translation of ``smiles``. If an error occurs,
		and ``smiles`` cannot be translated, :code:`None` is returned instead.

		:Example:

		>>> import selfies
		>>> selfies.encoder('C=CF')
		>>> import selfies as sf
		>>> sf.encoder("C=CF")
		'[C][=C][F]'

		.. note:: Currently, :func:`selfies.encoder` does not support the
		following types of SMILES:
		.. note:: This function does not currently support SMILES with:

		* SMILES using ring numbering across a dot-bond symbol
		to specify bonds, e.g. ``C1.C2.C12`` (propane) or
		``c1cc([O-].[Na+])ccc1`` (sodium phenoxide).
		* SMILES with ring numbering between atoms that are over
		``16 ** 3 = 4096`` atoms apart.
		* SMILES using the wildcard symbol ``*``.
		* SMILES using chiral specifications other than ``@`` and ``@@``.
		* The wildcard symbol ``*``.
		* The quadruple bond symbol ``$``.
		* Chirality specifications other than ``@`` and ``@@``.
		* Ring bonds across a dot symbol (e.g. ``c1cc([O-].[Na+])ccc1``) or
		ring bonds between atoms that are over 4000 atoms apart.

		Although SELFIES does not have aromatic symbols, this function
		does support aromatic SMILES strings by internally kekulizing them
		before translation.
		"""

		try:
		if '*' in smiles:
		raise ValueError("wildcard atom '*' not supported")
		mol = smiles_to_mol(smiles)
		except SMILESParserError as err:
		err_msg = "failed to parse input\n\tSMILES: {}".format(smiles)
		raise EncoderError(err_msg) from err

		all_selfies = [] # process dot-separated fragments separately
		for s in smiles.split("."):
		all_selfies.append(_translate_smiles(s))
		return '.'.join(all_selfies)
		if not mol.kekulize():
		err_msg = "kekulization failed\n\tSMILES: {}".format(smiles)
		raise EncoderError(err_msg)

		except ValueError as err:
		if print_error:
		print("Encoding error '{}': {}.".format(smiles, err))
		return None
		if strict:
		_check_bond_constraints(mol, smiles)

		# invert chirality of atoms where necessary,
		# such that they are restored when the SELFIES is decoded
		for atom in mol.get_atoms():
		if ((atom.chirality is not None)
		and mol.has_out_ring_bond(atom.index)
		and _should_invert_chirality(mol, atom)):
		atom.invert_chirality()

		ATOM_TYPE = 1
		BRANCH_TYPE = 2
		RING_TYPE = 3
		fragments = []
		for root in mol.get_roots():
		derived = list(_fragment_to_selfies(mol, None, root))
		fragments.append("".join(derived))
		return ".".join(fragments)


		def _parse_smiles(smiles: str) -> Iterable[Tuple[str, str, int]]:
		"""Parses a SMILES into its symbols.
		def _check_bond_constraints(mol, smiles):
		errors = []

		A generator, which parses a SMILES string and returns its symbol(s)
		one-by-one as a tuple of:
		(1) the bond symbol connecting the current atom/ring/branch symbol
		to the previous atom/ring/branch symbol (e.g. '=', '', '#')
		(2) the atom/ring/branch symbol as a string (e.g. 'C', '12', '(')
		(3) the type of the symbol in (2), represented as an integer that is
		either ``ATOM_TYPE``, ``BRANCH_TYPE``, or ``RING_TYPE``.
		As a precondition, we also assume ``smiles`` has no dots in it.
		for atom in mol.get_atoms():
		bond_cap = atom.bonding_capacity
		bond_count = mol.get_bond_count(atom.index)
		if bond_count > bond_cap:
		errors.append((atom_to_smiles(atom), bond_count, bond_cap))

		:param smiles: the SMILES to be parsed.
		:return: an iterable of the symbol(s) of the SMILES along with
		their types.
		"""
		if errors:
		err_msg = "input violates the currently-set semantic constraints\n" \
		"\tSMILES: {}\n" \
		"\tErrors:\n".format(smiles)
		for e in errors:
		err_msg += "\t[{:} with {} bond(s) - " \
		"a max. of {} bond(s) was specified]\n".format(*e)
		raise EncoderError(err_msg)

		i = 0

		while 0 <= i < len(smiles):
		def _should_invert_chirality(mol, atom):
		out_bonds = mol.get_out_dirbonds(atom.index)

		bond = ''

		if smiles[i] in {'-', '/', '\\', '=', '#', ":"}:
		bond = smiles[i]
		i += 1

		if smiles[i].isalpha(): # organic subset elements
		if smiles[i: i + 2] in ('Br', 'Cl'): # two letter elements
		symbol = smiles[i: i + 2]
		symbol_type = ATOM_TYPE
		i += 2
		else:
		symbol = smiles[i] # one letter elements (e.g. C, N, ...)
		symbol_type = ATOM_TYPE
		i += 1

		elif smiles[i] in ('(', ')'): # open and closed branch brackets
		bond = smiles[i + 1: i + 2]
		symbol = smiles[i]
		symbol_type = BRANCH_TYPE
		i += 1

		elif smiles[i] == '[': # atoms encased in brackets (e.g. [NH])
		r_idx = smiles.find(']', i + 1)
		symbol = smiles[i: r_idx + 1]
		symbol_type = ATOM_TYPE
		i = r_idx + 1

		if r_idx == -1:
		raise ValueError("malformed SMILES, missing ']'")

		# quick chirality specification check
		chiral_i = symbol.find('@')
		if symbol[chiral_i + 1].isalpha() and symbol[chiral_i + 1] != 'H':
		raise ValueError("chiral specification '{}' not supported"
		.format(symbol))

		elif smiles[i].isdigit(): # one-digit ring number
		symbol = smiles[i]
		symbol_type = RING_TYPE
		i += 1

		elif smiles[i] == '%': # two-digit ring number (e.g. %12)
		symbol = smiles[i + 1: i + 3]
		symbol_type = RING_TYPE
		i += 3

		# 1. rings whose right number are bonded to this atom (e.g. ...1...X1)
		# 2. rings whose left number are bonded to this atom (e.g. X1...1...)
		# 3. branches and other (e.g. X(...)...)
		partition = [[], [], []]
		for i, bond in enumerate(out_bonds):
		if not bond.ring_bond:
		partition[2].append(i)
		elif bond.src < bond.dst:
		partition[1].append(i)
		else:
		raise ValueError("unrecognized symbol '{}'".format(smiles[i]))
		partition[0].append(i)
		partition[1].sort(key=lambda x: out_bonds[x].dst)

		yield bond, symbol, symbol_type
		# construct permutation
		perm = partition[0] + partition[1] + partition[2]
		count = 0
		for i in range(len(perm)):
		for j in range(i + 1, len(perm)):
		if perm[i] > perm[j]:
		count += 1
		return count % 2 != 0 # if odd permutation, should invert chirality


		def _translate_smiles(smiles: str) -> str:
		"""A helper for ``selfies.encoder``, which translates a SMILES into a
		SELFIES (assuming the input SMILES contains no dots).
		def _fragment_to_selfies(mol, bond_into_root, root):
		derived = SinglyLinkedList()

		:param smiles: the SMILES to be translated.
		:return: the SELFIES translation of SMILES.
		"""
		bond_into_curr, curr = bond_into_root, root
		while True:
		curr_atom = mol.get_atom(curr)
		derived.append(_atom_to_selfies(bond_into_curr, curr_atom))

		smiles_gen = _parse_smiles(smiles)
		out_bonds = mol.get_out_dirbonds(curr)
		for i, bond in enumerate(out_bonds):

		char_set = set(smiles)
		if any(c in char_set for c in ['c', 'n', 'o', 'p', 'a', 's']):
		smiles_gen = kekulize_parser(smiles_gen)
		if bond.ring_bond:
		if bond.src < bond.dst:
		continue

		# a simple mutable counter to track which atom was the i-th derived atom
		derive_counter = [0]
		rev_bond = mol.get_dirbond(src=bond.dst, dst=bond.src)
		ring_len = bond.src - bond.dst
		Q_as_symbols = get_selfies_from_index(ring_len - 1)
		ring_symbol = "[{}Ring{}]".format(
		_ring_bonds_to_selfies(rev_bond, bond),
		len(Q_as_symbols)
		)

		# a dictionary to keep track of the rings to be made. If a ring with id
		# X is connected to the i-th and j-th derived atoms (i < j) with bond
		# symbol s, then after the i-th atom is derived, rings[X] = (s, i).
		# As soon as the j-th atom is derived, rings[X] is removed from <rings>,
		# and the ring is made.
		rings = {}
		derived.append(ring_symbol)
		for symbol in Q_as_symbols:
		derived.append(symbol)

		selfies, _ = _translate_smiles_derive(smiles_gen, rings, derive_counter)
		elif i == len(out_bonds) - 1:
		bond_into_curr, curr = bond, bond.dst

		if rings:
		raise ValueError("malformed ring numbering or ring numbering "
		"across a dot symbol")

		return selfies


		def _translate_smiles_derive(smiles_gen: Iterable[Tuple[str, str, int]],
		rings: Dict[int, Tuple[str, int]],
		counter: List[int]) -> Tuple[str, int]:
		"""Recursive helper for _translate_smiles.

		Derives the SELFIES from a SMILES, and returns a tuple of (1) the
		translated SELFIES and (2) the symbol length of the translated SELFIES.

		:param smiles_gen: an iterable of the symbols (and their types)
		of the SMILES to be translated, created by ``_parse_smiles``.
		:param rings: See ``rings`` in ``_translate_smiles``.
		:param counter: a one-element list that serves as a mutable counter.
		See ``derived_counter`` in ``_translate_smiles``.
		:return: A tuple of the translated SELFIES and its symbol length.
		"""

		selfies = ""
		selfies_len = 0
		prev_idx = -1

		for bond, symbol, symbol_type in smiles_gen:

		if bond == '-': # ignore explicit single bonds
		bond = ''

		if symbol_type == ATOM_TYPE:
		if symbol[0] == '[':
		selfies += "[{}{}expl]".format(bond, symbol[1:-1])
		else:
		selfies += "[{}{}]".format(bond, symbol)
		prev_idx = counter[0]
		counter[0] += 1
		selfies_len += 1
		branch = _fragment_to_selfies(mol, bond, bond.dst)
		Q_as_symbols = get_selfies_from_index(len(branch) - 1)
		branch_symbol = "[{}Branch{}]".format(
		_bond_to_selfies(bond, show_stereo=False),
		len(Q_as_symbols)
		)

		elif symbol_type == BRANCH_TYPE:
		if symbol == '(':
		derived.append(branch_symbol)
		for symbol in Q_as_symbols:
		derived.append(symbol)
		derived.extend(branch)

		# NOTE: looping inside a loop on a generator will produce
		# expected behaviour in this case.
		# end of chain
		if (not out_bonds) or out_bonds[-1].ring_bond:
		break

		branch, branch_len = \
		_translate_smiles_derive(smiles_gen, rings, counter)
		return derived

		N_as_symbols = get_symbols_from_n(branch_len - 1)
		bond_num = get_num_from_bond(bond)

		selfies += "[Branch{}_{}]".format(len(N_as_symbols), bond_num)
		selfies += ''.join(N_as_symbols) + branch
		selfies_len += 1 + len(N_as_symbols) + branch_len
		def _bond_to_selfies(bond, show_stereo=True):
		if not show_stereo and (bond.order == 1):
		return ""
		return bond_to_smiles(bond)

		else: # symbol == ')'
		break

		else: # symbol_type == RING_TYPE
		ring_id = int(symbol)
		def _ring_bonds_to_selfies(lbond, rbond):
		assert lbond.order == rbond.order

		if ring_id in rings:
		left_bond, left_end = rings.pop(ring_id)
		right_bond, right_end = bond, prev_idx
		if (lbond.order != 1) or all(b.stereo is None for b in (lbond, rbond)):
		return _bond_to_selfies(lbond, show_stereo=False)
		else:
		bond_char = "-" if (lbond.stereo is None) else lbond.stereo
		bond_char += "-" if (rbond.stereo is None) else rbond.stereo
		return bond_char

		ring_len = right_end - left_end
		N_as_symbols = get_symbols_from_n(ring_len - 1)

		if left_bond != '':
		selfies += "[Expl{}Ring{}]".format(left_bond,
		len(N_as_symbols))
		elif right_bond != '':
		selfies += "[Expl{}Ring{}]".format(right_bond,
		len(N_as_symbols))
		else:
		selfies += "[Ring{}]".format(len(N_as_symbols))

		selfies += ''.join(N_as_symbols)
		selfies_len += 1 + len(N_as_symbols)

		else:
		rings[ring_id] = (bond, prev_idx)

		return selfies, selfies_len
		def _atom_to_selfies(bond, atom):
		    """Build the bracketed SELFIES symbol for an atom.

		    :param bond: the bond leading into the atom, or ``None`` when there is
		        none, in which case no bond prefix is emitted.
		    :param atom: the atom to render; it must not be aromatic.
		    :return: a string of the form ``"[<bond><atom>]"``.
		    """
		    assert not atom.is_aromatic

		    if bond is None:
		        prefix = ""
		    else:
		        prefix = _bond_to_selfies(bond)

		    body = atom_to_smiles(atom, brackets=False)
		    return "[{}{}]".format(prefix, body)

+157

-377

selfies/grammar_rules.py

		@@ -1,428 +0,208 @@
		from itertools import product
		from typing import Dict, List, Optional, Set, Tuple
		import functools
		import itertools
		import re
		from typing import Any, List, Optional, Tuple

		default_bond_constraints = {
		'H': 1, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1,
		'O': 2, 'O+1': 3, 'O-1': 1,
		'N': 3, 'N+1': 4, 'N-1': 2,
		'C': 4, 'C+1': 5, 'C-1': 3,
		'P': 5, 'P+1': 6, 'P-1': 4,
		'S': 6, 'S+1': 7, 'S-1': 5,
		'?': 8
		}

		octet_rule_bond_constraints = dict(default_bond_constraints)
		octet_rule_bond_constraints.update(
		{'S': 2, 'S+1': 3, 'S-1': 1, 'P': 3, 'P+1': 4, 'P-1': 2}
		from selfies.constants import (
		ELEMENTS,
		INDEX_ALPHABET,
		INDEX_CODE,
		ORGANIC_SUBSET
		)
		from selfies.mol_graph import Atom
		from selfies.utils.smiles_utils import smiles_to_bond

		hypervalent_bond_constraints = dict(default_bond_constraints)
		hypervalent_bond_constraints.update(
		{'Cl': 7, 'Br': 7, 'I': 7, 'N': 5}
		)

		_bond_constraints = default_bond_constraints
		def process_atom_symbol(symbol: str) -> Optional[Tuple[Any, Atom]]:
		    """Resolve an atom symbol into its bond information and an atom.

		    Parsed symbols are memoised in ``_PROCESS_ATOM_CACHE`` as a
		    ``(bond_info, atom_factory)`` pair. The factory is invoked on every
		    call, including cache hits.

		    :param symbol: the symbol to process.
		    :return: a ``(bond_info, atom)`` tuple, or ``None`` when
		        ``_process_atom_selfies_no_cache`` rejects the symbol or the
		        resulting atom has a negative ``bonding_capacity``.
		    """
		    if symbol in _PROCESS_ATOM_CACHE:
		        parsed = _PROCESS_ATOM_CACHE[symbol]
		    else:
		        parsed = _process_atom_selfies_no_cache(symbol)
		        if parsed is None:
		            return None
		        # only successful parses are remembered
		        _PROCESS_ATOM_CACHE[symbol] = parsed

		    bond_info, make_atom = parsed
		    atom = make_atom()
		    if atom.bonding_capacity < 0:
		        return None  # too many Hs (e.g. [CH9])
		    return bond_info, atom

		def get_semantic_robust_alphabet() -> Set[str]:
		"""Returns a subset of all symbols that are semantically constrained
		by :mod:`selfies`.

		These semantic constraints can be configured with
		:func:`selfies.set_semantic_constraints`.
		def process_branch_symbol(symbol: str) -> Optional[Tuple[int, int]]:
		    """Look up a branch symbol in ``_PROCESS_BRANCH_CACHE``.

		    :param symbol: the symbol to look up.
		    :return: the cached tuple for the symbol, or ``None`` if the symbol
		        is not a key of the cache.
		    """
		    return _PROCESS_BRANCH_CACHE.get(symbol)

		:return: a subset of all symbols that are semantically constrained.
		"""

		alphabet_subset = set()
		def process_ring_symbol(symbol: str) -> Optional[Tuple[int, int, Any]]:
		    """Look up a ring symbol in ``_PROCESS_RING_CACHE``.

		    :param symbol: the symbol to look up.
		    :return: the cached tuple for the symbol, or ``None`` if the symbol
		        is not a key of the cache.
		    """
		    return _PROCESS_RING_CACHE.get(symbol)

		organic_subset = {'B', 'C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I'}
		bonds = {'': 1, '=': 2, '#': 3}

		# add atomic symbols
		for (a, c), (b, m) in product(_bond_constraints.items(), bonds.items()):

		if (m > c) or (a == '?'):
		continue

		if a in organic_subset:
		symbol = "[{}{}]".format(b, a)
		else:
		symbol = "[{}{}expl]".format(b, a)

		alphabet_subset.add(symbol)

		# add branch and ring symbols
		for i in range(1, 4):
		alphabet_subset.add("[Ring{}]".format(i))
		alphabet_subset.add("[Expl=Ring{}]".format(i))

		for j in range(1, 4):
		alphabet_subset.add("[Branch{}_{}]".format(i, j))

		return alphabet_subset


		def get_default_constraints() -> Dict[str, int]:
		    """Return a copy of the preset "default" bond constraint settings.

		    The caller receives a new dictionary, so modifying it does not affect
		    the module-level preset.

		    :return: the default constraint settings.
		    """
		    return default_bond_constraints.copy()


		def get_octet_rule_constraints() -> Dict[str, int]:
		    """Return a copy of the preset "octet rule" bond constraint settings.

		    This preset is a stricter variant of the default constraints, chosen
		    so that the `octet rule <https://en.wikipedia.org/wiki/Octet_rule>`_
		    is obeyed: ``S`` is limited to 2 bonds and ``P`` to 3 bonds, with the
		    ``S+``, ``S-``, ``P+`` and ``P-`` ions restricted analogously.

		    :return: the octet rule constraint settings.
		    """
		    return octet_rule_bond_constraints.copy()


		def get_hypervalent_constraints() -> Dict[str, int]:
		    """Return a copy of the preset "hypervalent" bond constraint settings.

		    This preset loosens the default constraints to accommodate
		    `hypervalent molecules
		    <https://en.wikipedia.org/wiki/Hypervalent_molecule>`_: ``Cl``, ``Br``
		    and ``I`` may form up to 7 bonds, and ``N`` up to 5 bonds.

		    :return: the hypervalent constraint settings.
		    """
		    return hypervalent_bond_constraints.copy()


		def get_semantic_constraints() -> Dict[str, int]:
		    """Return a copy of the bond constraints currently in effect.

		    These are the constraints installed by the most recent call to
		    :func:`selfies.set_semantic_constraints`, or the default bond
		    constraints if that function has not been called yet. See
		    :func:`selfies.set_semantic_constraints` for what the entries mean.

		    :return: the bond constraints :mod:`selfies` is currently operating on.
		    """
		    return _bond_constraints.copy()


		def set_semantic_constraints(
		        bond_constraints: Optional[Dict[str, int]] = None) -> None:
		    """Install the semantic (bond) constraints used by :mod:`selfies`.

		    The SELFIES grammar is enforced dynamically from ``bond_constraints``.
		    Each key names an atom or ion (e.g. ``I``, ``Fe+2``); ions are written
		    ``E+C`` or ``E-C`` with ``E`` an element and ``C`` a positive integer.
		    Each value is the maximum number of bonds that atom or ion may form,
		    between 1 and 8 inclusive, for example:

		    * ``bond_constraints['I'] = 1``
		    * ``bond_constraints['C'] = 4``

		    :func:`selfies.decoder` only produces SMILES that respect these limits.
		    With the example above, ``'[C][=I]'`` and ``'[I][=C]'`` decode to
		    ``'CI'`` and ``'IC'`` respectively, since ``I`` may form one bond only.

		    The mandatory ``'?'`` key gives the limit applied to any atom or ion
		    that has no entry of its own.

		    :param bond_constraints: the constraints to install. When ``None``
		        (the default), the module's default dictionary is installed as-is;
		        otherwise the argument is validated and a copy of it is stored.
		    :return: ``None``.
		    :raises ValueError: if ``'?'`` is not a key, or if any value lies
		        outside 1 to 8 inclusive.
		    """

		    global _bond_constraints

		    # no argument: fall back to the module default
		    if bond_constraints is None:
		        _bond_constraints = default_bond_constraints
		        return

		    # validate before anything is installed
		    if '?' not in bond_constraints:
		        raise ValueError("bond_constraints missing '?' as a key.")

		    for atom_or_ion, max_bonds in bond_constraints.items():
		        if not (1 <= max_bonds <= 8):
		            raise ValueError("bond_constraints['{}'] not between "
		                             "1 and 8 inclusive.".format(atom_or_ion))

		    # keep a private copy so later edits by the caller have no effect
		    _bond_constraints = dict(bond_constraints)


		# Symbol State Dict Functions ==============================================


		def get_next_state(symbol: str, state: int) -> Tuple[str, int]:
		"""Enforces the grammar rules for standard SELFIES symbols.

		Given the current non-branch, non-ring symbol and current derivation
		state, retrieves the derived SMILES symbol and the next derivation
		state.

		:param symbol: a SELFIES symbol that is not a Ring or Branch.
		:param state: the current derivation state.
		:return: a tuple of (1) the derived symbol, and
		(2) the next derivation state.
		"""

		if symbol == '[epsilon]':
		return ('', 0) if state == 0 else ('', -1)

		# convert to smiles symbol
		bond = ''
		if symbol[1] in {'/', '\\', '=', '#'}:
		bond = symbol[1]
		bond_num = get_num_from_bond(bond)

		if symbol[-5:] == 'expl]': # e.g. [C@@Hexpl]
		smiles_symbol = "[{}]".format(symbol[1 + len(bond):-5])
		else:
		smiles_symbol = symbol[1 + len(bond):-1]

		# get bond capacity
		element, h_count, charge = parse_atom_symbol(smiles_symbol)

		if charge == 0:
		atom_or_ion = element
		else:
		atom_or_ion = "{}{:+}".format(element, charge)

		max_bonds = _bond_constraints.get(atom_or_ion,
		_bond_constraints['?'])

		if (h_count > max_bonds) or (h_count == max_bonds and state > 0):
		raise ValueError("too many Hs in symbol '{}'; consider "
		"adjusting bond constraints".format(symbol))
		max_bonds -= h_count # hydrogens consume 1 bond

		# calculate next state
		def next_atom_state(
		bond_order: int, bond_cap: int, state: int
		) -> Tuple[int, Optional[int]]:
		if state == 0:
		bond = ''
		next_state = max_bonds
		else:
		if bond_num > min(state, max_bonds):
		bond_num = min(state, max_bonds)
		bond = get_bond_from_num(bond_num)
		bond_order = 0

		next_state = max_bonds - bond_num
		if next_state == 0:
		next_state = -1
		bond_order = min(bond_order, state, bond_cap)
		bonds_left = bond_cap - bond_order
		next_state = None if (bonds_left == 0) else bonds_left
		return bond_order, next_state

		return (bond + smiles_symbol), next_state

		def next_branch_state(
		branch_type: int, state: int
		) -> Tuple[int, Optional[int]]:
		assert 1 <= branch_type <= 3
		assert state > 1

		# Branch State Dict Functions =================================================
		branch_init_state = min(state - 1, branch_type)
		next_state = state - branch_init_state
		return branch_init_state, next_state


		def get_next_branch_state(branch_symbol: str, state: int) -> Tuple[int, int]:
		"""Enforces the grammar rules for SELFIES Branch symbols.
		def next_ring_state(
		ring_type: int, state: int
		) -> Tuple[int, Optional[int]]:
		assert state > 0

		Given the branch symbol and current derivation state, retrieves
		the initial branch derivation state (i.e. the derivation state that the
		new branch begins on), and the next derivation state (i.e. the derivation
		state after the branch is created).
		bond_order = min(ring_type, state)
		bonds_left = state - bond_order
		next_state = None if (bonds_left == 0) else bonds_left
		return bond_order, next_state

		:param branch_symbol: the branch symbol (e.g. [Branch1_2], [Branch3_1])
		:param state: the current derivation state.
		:return: a tuple of (1) the initial branch state, and
		(2) the next derivation state.
		"""

		branch_type = int(branch_symbol[-2]) # branches of the form [BranchL_X]

		if not (1 <= branch_type <= 3):
		raise ValueError("unknown branch symbol '{}'".format(branch_symbol))

		if 2 <= state <= 8:
		branch_init_state = min(state - 1, branch_type)
		next_state = state - branch_init_state
		return branch_init_state, next_state
		else:
		return -1, state


		# SELFIES Symbol to N Functions ============================================

		_index_alphabet = ['[C]', '[Ring1]', '[Ring2]',
		'[Branch1_1]', '[Branch1_2]', '[Branch1_3]',
		'[Branch2_1]', '[Branch2_2]', '[Branch2_3]',
		'[O]', '[N]', '[=N]', '[=C]', '[#C]', '[S]', '[P]']

		# _alphabet_code takes as a key a SELFIES symbol, and its corresponding value
		# is the index of the key.

		_alphabet_code = {c: i for i, c in enumerate(_index_alphabet)}


		def get_n_from_symbols(*symbols: List[str]) -> int:
		"""Computes N from a list of SELFIES symbols.

		Converts a list of SELFIES symbols [c_1, ..., c_n] into a number N.
		This is done by converting each symbol c_n to an integer idx(c_n) via
		``_alphabet_code``, and then treating the list as a number in base
		len(_alphabet_code). If a symbol is unrecognized, it is given value 0 by
		default.

		:param symbols: a list of SELFIES symbols.
		:return: the corresponding N for ``symbols``.
		"""

		N = 0
		def get_index_from_selfies(*symbols: List[str]) -> int:
		index = 0
		for i, c in enumerate(reversed(symbols)):
		N_i = _alphabet_code.get(c, 0) * (len(_alphabet_code) ** i)
		N += N_i
		return N
		index += INDEX_CODE.get(c, 0) * (len(INDEX_CODE) ** i)
		return index


		def get_symbols_from_n(n: int) -> List[str]:
		"""Converts an integer n into a list of SELFIES symbols that, if
		passed into ``get_n_from_symbols`` in that order, would have produced n.
		def get_selfies_from_index(index: int) -> List[str]:
		if index < 0:
		raise IndexError()
		elif index == 0:
		return [INDEX_ALPHABET[0]]

		:param n: an integer from 0 to 4095 inclusive.
		:return: a list of SELFIES symbols representing n in base
		``len(_alphabet_code)``.
		"""

		if n == 0:
		return [_index_alphabet[0]]

		symbols = []
		base = len(_index_alphabet)
		while n:
		symbols.append(_index_alphabet[n % base])
		n //= base
		base = len(INDEX_ALPHABET)
		while index:
		symbols.append(INDEX_ALPHABET[index % base])
		index //= base
		return symbols[::-1]


		# Helper Functions ============================================================
		# =============================================================================
		# Caches (for computational speed)
		# =============================================================================


		def get_num_from_bond(bond_symbol: str) -> int:
		"""Retrieves the bond multiplicity from a SMILES symbol representing
		a bond. If ``bond_symbol`` is not known, 1 is returned by default.
		SELFIES_ATOM_PATTERN = re.compile(
		r"^[\[]" # opening square bracket [
		r"([=#/\\]?)" # bond char
		r"(\d*)" # isotope number (optional, e.g. 123, 26)
		r"([A-Z][a-z]?)" # element symbol
		r"([@]{0,2})" # chiral_tag (optional, only @ and @@ supported)
		r"((?:[H]\d)?)" # H count (optional, e.g. H1, H3)
		r"((?:[+-][1-9]+)?)" # charge (optional, e.g. +1)
		r"[]]$" # closing square bracket ]
		)

		:param bond_symbol: a SMILES symbol representing a bond.
		:return: the bond multiplicity of ``bond_symbol``, or 1 if
		``bond_symbol`` is not recognized.
		"""

		if bond_symbol == "=":
		return 2
		elif bond_symbol == "#":
		return 3
		else:
		return 1
		def _process_atom_selfies_no_cache(symbol):
		m = SELFIES_ATOM_PATTERN.match(symbol)
		if m is None:
		return None
		bond_char, isotope, element, chirality, h_count, charge = m.groups()

		if symbol[1 + len(bond_char):-1] in ORGANIC_SUBSET:
		atom_fac = functools.partial(Atom, element=element, is_aromatic=False)
		return smiles_to_bond(bond_char), atom_fac

		def get_bond_from_num(n: int) -> str:
		"""Returns the SMILES symbol representing a bond with multiplicity
		``n``. More specifically, ``'' = 1`` and ``'=' = 2`` and ``'#' = 3``.
		isotope = None if (isotope == "") else int(isotope)
		if element not in ELEMENTS:
		return None
		chirality = None if (chirality == "") else chirality

		:param n: either 1, 2, 3.
		:return: the SMILES symbol representing a bond with multiplicity ``n``.
		"""
		s = h_count
		if s == "":
		h_count = 0
		else:
		h_count = int(s[1:])

		return ('', '=', '#')[n - 1]
		s = charge
		if s == "":
		charge = 0
		else:
		charge = int(s[1:])
		charge *= 1 if (s[0] == "+") else -1

		atom_fac = functools.partial(
		Atom,
		element=element,
		is_aromatic=False,
		isotope=isotope,
		chirality=chirality,
		h_count=h_count,
		charge=charge
		)

		def find_element(atom_symbol: str) -> Tuple[int, int]:
		"""Returns the indices of the element component of a SMILES atom symbol.
		return smiles_to_bond(bond_char), atom_fac

		That is, if atom_symbol[i:j] is the element substring of the SMILES atom,
		then (i, j) is returned. For example:
		* _find_element('b') = (0, 1).
		* _find_element('B') = (0, 1).
		* _find_element('[13C]') = (3, 4).
		* _find_element('[nH+]') = (1, 2).

		:param atom_symbol: a SMILES atom.
		:return: a tuple of the indices of the element substring of
		``atom_symbol``.
		"""
		def _build_atom_cache():
		cache = dict()
		common_symbols = [
		"[#C+1]", "[#C-1]", "[#C]", "[#N+1]", "[#N]", "[#O+1]", "[#P+1]",
		"[#P-1]", "[#P]", "[#S+1]", "[#S-1]", "[#S]", "[=C+1]", "[=C-1]",
		"[=C]", "[=N+1]", "[=N-1]", "[=N]", "[=O+1]", "[=O]", "[=P+1]",
		"[=P-1]", "[=P]", "[=S+1]", "[=S-1]", "[=S]", "[Br]", "[C+1]", "[C-1]",
		"[C]", "[Cl]", "[F]", "[H]", "[I]", "[N+1]", "[N-1]", "[N]", "[O+1]",
		"[O-1]", "[O]", "[P+1]", "[P-1]", "[P]", "[S+1]", "[S-1]", "[S]"
		]

		if atom_symbol[0] != '[':
		return 0, len(atom_symbol)
		for symbol in common_symbols:
		cache[symbol] = _process_atom_selfies_no_cache(symbol)
		return cache

		i = 1
		while atom_symbol[i].isdigit(): # skip isotope number
		i += 1

		if atom_symbol[i + 1].isalpha() and atom_symbol[i + 1] != 'H':
		return i, i + 2
		else:
		return i, i + 1
		def _build_branch_cache():
		    """Precomputes the lookup table for SELFIES branch symbols.

		    One entry is created for every combination of a bond character
		    (``""``, ``"="``, ``"#"``) and a branch length ``L`` in 1..3.

		    :return: a dictionary mapping a symbol of the form ``[{bond}Branch{L}]``
		        to the tuple ``(smiles_to_bond(bond)[0], L)``.
		    """

		    # L is the outer loop so that entries are inserted in the same order
		    # as a plain nested for-loop over (L, bond_char) would produce.
		    return {
		        "[{}Branch{}]".format(bond_char, L): (smiles_to_bond(bond_char)[0], L)
		        for L in range(1, 4)
		        for bond_char in ["", "=", "#"]
		    }


		def parse_atom_symbol(atom_symbol: str) -> Tuple[str, int, int]:
		"""Parses a SMILES atom symbol and returns its element component,
		number of associated hydrogens, and charge.
		def _build_ring_cache():
		cache = dict()
		for L in range(1, 4):
		# [RingL], [=RingL], [#RingL]
		for bond_char in ["", "=", "#"]:
		symbol = "[{}Ring{}]".format(bond_char, L)
		order, stereo = smiles_to_bond(bond_char)
		cache[symbol] = (order, L, (stereo, stereo))

		See http://opensmiles.org/opensmiles.html for the formal grammar
		of SMILES atom symbols. Note that only @ and @@ are currently supported
		as chiral specifications.
		# [-/RingL], [\/RingL], [\-RingL], ...
		for lchar, rchar in itertools.product(["-", "/", "\\"], repeat=2):
		if lchar == rchar == "-":
		continue
		symbol = "[{}{}Ring{}]".format(lchar, rchar, L)
		order, lstereo = smiles_to_bond(lchar)
		order, rstereo = smiles_to_bond(rchar)
		cache[symbol] = (order, L, (lstereo, rstereo))
		return cache

		:param atom_symbol: a SMILES atom symbol.
		:return: a tuple of (1) the element of ``atom_symbol``, (2) the hydrogen
		count, and (3) the charge.
		"""

		if atom_symbol[0] != '[':
		return atom_symbol, 0, 0
		_PROCESS_ATOM_CACHE = _build_atom_cache()

		atom_start, atom_end = find_element(atom_symbol)
		i = atom_end
		_PROCESS_BRANCH_CACHE = _build_branch_cache()

		# skip chirality
		if atom_symbol[i] == '@': # e.g. @
		i += 1
		if atom_symbol[i] == '@': # e.g. @@
		i += 1

		h_count = 0 # hydrogen count
		if atom_symbol[i] == 'H':
		h_count = 1

		i += 1
		if atom_symbol[i].isdigit(): # e.g. [CH2]
		h_count = int(atom_symbol[i])
		i += 1

		charge = 0 # charge count
		if atom_symbol[i] in ('+', '-'):
		charge = 1 if atom_symbol[i] == '+' else -1

		i += 1
		if atom_symbol[i] in ('+', '-'): # e.g. [Cu++]
		while atom_symbol[i] in ('+', '-'):
		charge += (1 if atom_symbol[i] == '+' else -1)
		i += 1

		elif atom_symbol[i].isdigit(): # e.g. [Cu+2]
		s = i
		while atom_symbol[i].isdigit():
		i += 1
		charge *= int(atom_symbol[s:i])

		return atom_symbol[atom_start: atom_end], h_count, charge
		_PROCESS_RING_CACHE = _build_ring_cache()

+2

-2

setup.py

		@@ -10,4 +10,4 @@ #!/usr/bin/env python
		name="selfies",
		version="1.0.4",
		author="Mario Krenn",
		version="2.0.0",
		author="Mario Krenn, Alston Lo, and many other contributors",
		author_email="mario.krenn@utoronto.ca, alan@aspuru.com",
		@@ -14,0 +14,0 @@ description="SELFIES (SELF-referencIng Embedded Strings) is a "

-521

selfies/kekulize.py

		from typing import Dict, Iterable, List, Set, Tuple, Union

		from selfies.grammar_rules import find_element, get_num_from_bond, \
		parse_atom_symbol

		# Tags compared against the third element of each ``smiles_symbols`` entry
		# (see ``_build_molecular_graph``) to decide how the entry is handled.
		ATOM_TYPE = 1
		BRANCH_TYPE = 2
		# NOTE(review): RING_TYPE is never compared against in this module; the
		# graph builder treats every entry that is neither an atom nor a branch as
		# a ring bond -- presumably those entries carry this tag, confirm in the
		# parser that produces them.
		RING_TYPE = 3


		def kekulize_parser(smiles_gen: Iterable[Tuple[str, str, int]]) \
		        -> Iterable[Tuple[str, str, int]]:
		    """Kekulizes a SMILES that is given as an iterable of parsed symbols.

		    This generator sits between ``encoder._parse_smiles`` and its consumer:
		    it takes the parsed symbols, removes aromaticity from them, and yields
		    them again in the same format. Working on the already-parsed symbols
		    avoids re-parsing and re-concatenating the SMILES string.

		    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
		               -treatment-of-aromaticity-in-the-smiles-language/

		    :param smiles_gen: an iterator returned by ``encoder._parse_smiles``.
		    :return: an iterator over the kekulized SMILES symbols, in the same
		        format as that returned by ``encoder._parse_smiles``.
		    """

		    # Materialize the input so several helpers can walk it, and turn each
		    # tuple into a list so the helpers can edit entries in place.
		    mutable_symbols = [list(entry) for entry in smiles_gen]

		    mol_graph = MolecularGraph(mutable_symbols)
		    _build_molecular_graph(mol_graph, mutable_symbols, {})

		    # nothing to do unless at least one aromatic atom was found
		    if mol_graph.aro_indices:
		        _kekulize(mol_graph)

		    # hand the entries back as tuples, one at a time
		    yield from map(tuple, mol_graph.smiles_symbols)


		def _build_molecular_graph(graph,
		                           smiles_symbols: List[List[Union[str, int]]],
		                           rings: Dict[int, Tuple[int, int]],
		                           prev_idx: int = -1,
		                           curr_idx: int = -1) -> int:
		    """From the iterator returned by ``encoder._parse_smiles``, builds
		    a graph representation of the molecule.

		    This is done by iterating through ``smiles_symbols``, and then adding bonds
		    to the molecular graph. Note that ``smiles_symbols`` is mutated in this
		    method, for convenience: when a ring is closed, one of the two bond
		    fields of the ring's entries is overwritten with ``''``.

		    The function calls itself once for every ``'('`` branch symbol; the
		    recursive call returns when it reaches a branch symbol that is not
		    ``'('`` (or the end of the list).

		    :param graph: the MolecularGraph to be added to.
		    :param smiles_symbols: a list created from the iterator returned
		        by ``encoder._parse_smiles``.
		    :param rings: an, initially, empty dictionary used to keep track of
		        rings to be made. It maps a ring symbol to the pair
		        ``(atom index, ring entry index)`` recorded when the ring symbol
		        was first seen.
		    :param prev_idx: the index of the most recently processed atom, i.e.
		        the atom that the next atom or ring bond attaches to; a negative
		        value means that no atom has been seen yet.
		    :param curr_idx: the index of the last entry that has already been
		        processed; scanning resumes at ``curr_idx + 1``.
		    :return: the last index of ``smiles_symbols`` that was processed.
		    """

		    while curr_idx + 1 < len(smiles_symbols):

		        curr_idx += 1
		        _, symbol, symbol_type = smiles_symbols[curr_idx]

		        if symbol_type == ATOM_TYPE:
		            # an atom entry also stores the bond that leads to it, so the
		            # atom's own index doubles as the bond index
		            if prev_idx >= 0:
		                graph.add_bond(prev_idx, curr_idx, curr_idx)
		            prev_idx = curr_idx

		        elif symbol_type == BRANCH_TYPE:
		            if symbol == '(':
		                # The branch is built by a recursive call that starts from
		                # the current atom. Only curr_idx is taken from the result;
		                # prev_idx is left untouched so that whatever follows the
		                # branch attaches to the atom the branch started from.
		                curr_idx = _build_molecular_graph(graph, smiles_symbols, rings,
		                                                  prev_idx, curr_idx)
		            else:
		                # any other branch symbol ends the current (recursive) call
		                break

		        else:
		            # neither atom nor branch: the entry is handled as a ring bond
		            if symbol in rings:
		                # second occurrence of this ring symbol: close the ring
		                left_idx, left_bond_idx = rings.pop(symbol)
		                right_idx, right_bond_idx = prev_idx, curr_idx

		                # we mutate one bond index to be '', so that we
		                # can faithfully represent the bond to be localized at
		                # one index. For example, C=1CCCC=1 --> C1CCCC=1.

		                if smiles_symbols[left_bond_idx][0] != '':
		                    bond_idx = left_bond_idx
		                    smiles_symbols[right_bond_idx][0] = ''
		                else:
		                    bond_idx = right_bond_idx
		                    smiles_symbols[left_bond_idx][0] = ''

		                graph.add_bond(left_idx, right_idx, bond_idx)
		            else:
		                # first occurrence: remember which atom opened the ring
		                # and which entry holds the ring's bond
		                rings[symbol] = (prev_idx, curr_idx)

		    return curr_idx


		def _kekulize(mol_graph) -> None:
		    """Removes aromaticity from a molecular graph, in place.

		    The graph is first reduced to its pi subgraph. Then double bonds are
		    assigned starting from each of its nodes in turn, and finally the
		    result is written back into the graph's SMILES symbols.

		    :param mol_graph: a molecular graph to be kekulized.
		    :return: None.
		    :raises ValueError: if no valid double bond assignment is found for
		        some node.
		    """

		    mol_graph.prune_to_pi_subgraph()

		    # ``seen`` is shared across all start nodes, whereas every start node
		    # gets its own fresh sets of matched nodes and matched edges
		    seen = set()
		    for node in mol_graph.get_nodes_by_num_edges():
		        if not mol_graph.dfs_assign_bonds(node, seen, set(), set()):
		            raise ValueError("kekulization algorithm failed")

		    mol_graph.write_to_smiles_symbols()


		# Aromatic Helper Methods and Classes

		# key = aromatic SMILES element, value = number of valence electrons
		# Note: wild card '*' not supported currently
		_aromatic_valences = {
		'b': 3, 'al': 3, 'c': 4, 'si': 4, 'n': 5, 'p': 5,
		'as': 5, 'o': 6, 's': 6, 'se': 6, 'te': 6
		}


		def _capitalize(atom_symbol: str) -> str:
		    """Upper-cases the first letter of the element inside a SMILES atom
		    symbol, which turns an aromatic atom symbol into a standard one.

		    :param atom_symbol: an aromatic SMILES atom symbol.
		    :return: ``atom_symbol`` with the first character of its element
		        capitalized; everything else is left as it is.
		    """

		    start, _ = find_element(atom_symbol)

		    before = atom_symbol[:start]
		    first_letter = atom_symbol[start]
		    after = atom_symbol[start + 1:]
		    return "".join((before, first_letter.upper(), after))


		def _is_aromatic(atom_symbol: str) -> bool:
		    """Tells whether a SMILES atom symbol denotes an aromatic atom.

		    Aromatic atoms are written with an element that does not start with a
		    capital letter.

		    :param atom_symbol: a SMILES atom symbol.
		    :return: True, if ``atom_symbol`` is an aromatic atom symbol,
		        and False otherwise.
		    :raises ValueError: if the element is not capitalized but is also not
		        one of the known aromatic elements.
		    """

		    start, end = find_element(atom_symbol)

		    # when the element reaches the end of the symbol, the symbol itself is
		    # used, which saves a string copy
		    whole_symbol = (end == len(atom_symbol))
		    element = atom_symbol if whole_symbol else atom_symbol[start: end]

		    if element[0].isupper():
		        return False
		    if element in _aromatic_valences:
		        return True

		    raise ValueError("unrecognized aromatic symbol '{}'"
		                     .format(atom_symbol))


		def _in_pi_subgraph(atom_symbol: str, bonds: Tuple[str]) -> bool:
		    """Decides, from an atom and its bonds, whether the atom is a node of
		    the pi subgraph.

		    An atom is part of the pi subgraph when it is left with an unpaired
		    valence electron, which means it can take part in a double bond.

		    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
		               -treatment-of-aromaticity-in-the-smiles-language/

		    :param atom_symbol: a SMILES atom symbol representing an atom.
		    :param bonds: the bonds connected to ``atom_symbol``.
		    :return: True if ``atom_symbol`` should be included in the pi subgraph,
		        and False otherwise.
		    :raises ValueError: if the atom carries more than one hydrogen.
		    """

		    atom, h_count, charge = parse_atom_symbol(atom_symbol)

		    # every bond uses as many electrons as its multiplicity
		    used_electrons = sum(get_num_from_bond(b) for b in bonds)

		    # A bare aromatic carbon with two bonds (e.g. c1ccccc1) has one implied
		    # hydrogen. This also covers the neutral carbon radical case
		    # (e.g. C1=[C]NC=C1), which is treated like a 1-H carbon
		    # (e.g. C1=[CH]NC=C1).
		    has_implied_hydrogen = (
		        atom == 'c'
		        and h_count == 0
		        and charge == 0
		        and len(bonds) == 2
		        and '#' not in bonds
		    )
		    if has_implied_hydrogen:
		        h_count += 1

		    if h_count > 1:
		        raise ValueError("unrecognized aromatic symbol '{}'"
		                         .format(atom_symbol))
		    if h_count == 1:  # e.g. [nH]
		        used_electrons += 1

		    valence = _aromatic_valences[atom] - charge
		    return (valence - used_electrons) % 2 != 0


		class MolecularGraph:
		    """A molecular graph.

		    This molecular graph operates based on the ``smiles_symbols`` data
		    structure. Indices from this list represent nodes or edges, depending
		    on whether they point to a SMILES atom(s) or bond.

		    :ivar smiles_symbols: the list created from the iterator returned by
		        ``encoder._parse_smiles``. Serves as the base data structure
		        of this class, as everything is communicated through indices
		        referring to elements of this list. Element ``[0]`` of an entry is
		        read and written as the bond symbol, element ``[1]`` as the atom
		        symbol, and element ``[2]`` is compared against ``BRANCH_TYPE``.
		    :ivar graph: the key is an index of the atom(s) from ``smiles_symbols``.
		        The value is a list of Bond objects representing the connected
		        bonds. Represents the actual molecular graph.
		    :ivar aro_indices: a set of indices of atom(s) from ``smiles_symbols``
		        that are aromatic in the molecular graph.
		    """

		    def __init__(self, smiles_symbols: List[List[Union[str, int]]]) -> None:
		        # the list is stored by reference, so later edits are visible to
		        # whoever else holds it
		        self.smiles_symbols = smiles_symbols
		        self.graph = {}
		        self.aro_indices = set()

		    def get_atom_symbol(self, idx: int) -> str:
		        """Getter that returns the SMILES symbol representing an atom
		        at a specified index.

		        :param idx: an index in ``smiles_symbols``.
		        :return: the SMILES symbol representing an atom at index
		            ``idx`` in ``smiles_symbols``.
		        """

		        return self.smiles_symbols[idx][1]

		    def get_bond_symbol(self, idx: int) -> str:
		        """Getter that returns the SMILES symbol representing a bond at
		        a specified index.

		        :param idx: an index in ``smiles_symbols``.
		        :return: the SMILES symbol representing a bond at index
		            ``idx`` in ``smiles_symbols``.
		        """

		        return self.smiles_symbols[idx][0]

		    def get_nodes_by_num_edges(self) -> List[int]:
		        """Returns all nodes (or indices) stored in this molecular graph
		        in a semi-sorted order by number of edges.

		        This is to optimize the speed of ``dfs_assign_bonds``; starting
		        with nodes that have fewer edges will improve computational time
		        as there are fewer bond configurations to explore. Instead of fully
		        sorting the returned list, a compromise is made, and nodes with at
		        most one edge are added to the list's beginning.

		        :return: a list of the nodes (or indices) of this molecular graph,
		            semi-sorted by number of edges.
		        """

		        ends = []  # nodes with at most 1 edge (this includes 0 edges)
		        middles = []  # nodes with 2+ edges

		        for idx, edges in self.graph.items():
		            if len(edges) > 1:
		                middles.append(idx)
		            else:
		                ends.append(idx)

		        # ``ends`` is reused as the result: ends first, then middles
		        ends.extend(middles)
		        return ends

		    def set_atom_symbol(self, atom_symbol: str, idx: int) -> None:
		        """Setter that updates the SMILES symbol representing an atom(s) at
		        a specified index.

		        :param atom_symbol: the new value of the atom symbol at ``idx``.
		        :param idx: an index in ``smiles_symbols``.
		        :return: None.
		        """

		        self.smiles_symbols[idx][1] = atom_symbol

		    def set_bond_symbol(self, bond_symbol: str, idx: int) -> None:
		        """Setter that updates the SMILES symbol representing a bond at
		        a specified index.

		        :param bond_symbol: the new value of the bond symbol at ``idx``.
		        :param idx: an index in ``smiles_symbols``.
		        :return: None.
		        """

		        self.smiles_symbols[idx][0] = bond_symbol

		    def add_bond(self, idx_a: int, idx_b: int, bond_idx: int) -> None:
		        """Adds a bond (or edge) to this molecular graph between atoms
		        (or nodes) at two specified indices.

		        As a side effect, both atoms are recorded in ``aro_indices`` when
		        they are aromatic or when the bond is the aromatic bond ``':'``;
		        in the latter case the bond symbol stored in ``smiles_symbols`` is
		        also replaced by ``''``.

		        :param idx_a: the index of one atom (or node) of this bond.
		        :param idx_b: the index of the other atom (or node) of this bond.
		        :param bond_idx: the index of this bond.
		        :return: None.
		        :raises ValueError: raised by ``_is_aromatic`` if an atom has a
		            non-capitalized element that is not a known aromatic element.
		        """

		        atom_a = self.get_atom_symbol(idx_a)
		        atom_b = self.get_atom_symbol(idx_b)
		        # the membership test comes first, so _is_aromatic is skipped for
		        # atoms that were already recorded as aromatic
		        atom_a_aro = (idx_a in self.aro_indices) or _is_aromatic(atom_a)
		        atom_b_aro = (idx_b in self.aro_indices) or _is_aromatic(atom_b)
		        bond_symbol = self.get_bond_symbol(bond_idx)

		        if atom_a_aro:
		            self.aro_indices.add(idx_a)

		        if atom_b_aro:
		            self.aro_indices.add(idx_b)

		        if bond_symbol == ':':
		            # an explicit aromatic bond marks both of its ends as aromatic
		            self.aro_indices.add(idx_a)
		            self.aro_indices.add(idx_b)

		            # Note: ':' bonds are edited here to ''
		            self.set_bond_symbol('', bond_idx)
		            bond_symbol = ''

		        edge = Bond(idx_a, idx_b, bond_symbol, bond_idx)

		        # the very same Bond object is stored under both of its ends, so
		        # a later change to the bond is seen from either end
		        self.graph.setdefault(idx_a, []).append(edge)
		        self.graph.setdefault(idx_b, []).append(edge)

		    def prune_to_pi_subgraph(self) -> None:
		        """Removes nodes and edges from this molecular graph such that
		        it becomes the pi subgraph.

		        The remaining graph will only contain aromatic atoms (or nodes)
		        that belong in the pi-subgraph, and the bonds between such atoms
		        whose bond symbol is ``''``.

		        :return: None.
		        """

		        # remove non-aromatic nodes
		        non_aromatic = self.graph.keys() - self.aro_indices
		        for i in non_aromatic:
		            self.graph.pop(i)

		        # remove non-pi subgraph nodes
		        for i in self.aro_indices:

		            atom = self.get_atom_symbol(i)
		            # all bonds of the atom are considered here, including those
		            # to nodes that were removed above (edge lists are only
		            # filtered in the last step)
		            bonds = tuple(edge.bond_symbol for edge in self.graph[i])

		            if not _in_pi_subgraph(atom, bonds):
		                self.graph.pop(i)

		        # remove irrelevant edges
		        for idx, edges in self.graph.items():

		            # keep an edge only if both ends are still nodes of the graph
		            # and the bond symbol is ''. Only values of existing keys are
		            # replaced, so the dict does not change size while iterating.
		            keep = list(filter(lambda e: (e.idx_a in self.graph)
		                                         and (e.idx_b in self.graph)
		                                         and (e.bond_symbol == ''),
		                               edges))
		            self.graph[idx] = keep

		    def dfs_assign_bonds(self, idx: int,
		                         visited: Set[int],
		                         matched_nodes: Set[int],
		                         matched_edges: Set['Bond']) -> bool:
		        """After calling ``prune_to_pi_subgraph``, this method assigns
		        double bonds between pairs of nodes such that every node is
		        paired or matched.

		        This is done recursively in a depth-first search fashion. All three
		        sets are mutated in place; when an attempt fails, they are restored
		        to the state they had before the attempt.

		        :param idx: the index of the current atom (or node).
		        :param visited: a set of the indices of nodes that have been visited.
		        :param matched_nodes: a set of the indices of nodes that have been
		            matched, i.e., assigned a double bond.
		        :param matched_edges: a set of the bonds that have been matched.
		        :return: True, if a valid bond assignment was found; False otherwise.
		        """

		        if idx in visited:
		            return True

		        edges = self.graph[idx]

		        if idx in matched_nodes:

		            # recursively try to match adjacent nodes. If the matching
		            # fails, then we must backtrack.
		            visited_save = visited.copy()

		            visited.add(idx)
		            for e in edges:
		                adj = e.other_end(idx)
		                if not self.dfs_assign_bonds(adj, visited,
		                                             matched_nodes,
		                                             matched_edges):
		                    # in-place intersection: drops everything that was
		                    # added to ``visited`` since the snapshot, while
		                    # keeping the caller's set object
		                    visited &= visited_save
		                    return False
		            return True

		        else:

		            # list of candidate edges that can become a double bond
		            candidates = list(
		                filter(lambda i: i.other_end(idx) not in matched_nodes, edges)
		            )

		            if not candidates:
		                return False  # idx is unmatched, but all adj nodes are matched

		            matched_edges_save = matched_edges.copy()

		            for e in candidates:

		                # match nodes connected by e
		                matched_nodes.add(e.idx_a)
		                matched_nodes.add(e.idx_b)
		                matched_edges.add(e)

		                # idx is now in matched_nodes, so this call takes the
		                # branch above and continues with the neighbours of idx
		                success = self.dfs_assign_bonds(idx, visited,
		                                                matched_nodes,
		                                                matched_edges)

		                if success:
		                    e.bond_symbol = '='
		                    return True
		                else:  # the matching failed, so we must backtrack

		                    # undo every edge matched since the snapshot: reset
		                    # its bond symbol and unmatch both of its ends
		                    for edge in matched_edges - matched_edges_save:
		                        edge.bond_symbol = ''
		                        matched_nodes.discard(edge.idx_a)
		                        matched_nodes.discard(edge.idx_b)

		                    matched_edges &= matched_edges_save

		            # every candidate was tried without success
		            return False

		    def write_to_smiles_symbols(self) -> None:
		        """Updates and mutates ``self.smiles_symbols`` with the information
		        contained in ``self.graph``.

		        After kekulizing the molecular graph, this method is called to
		        merge the new information back into the original data structure.

		        :return: None.
		        """

		        # capitalize aromatic molecules
		        for idx in self.aro_indices:
		            self.set_atom_symbol(_capitalize(self.get_atom_symbol(idx)), idx)

		        # write bonds (an edge is listed under both of its ends, so it is
		        # written once per end that is still in the graph)
		        for edge_list in self.graph.values():
		            for edge in edge_list:
		                bond_symbol = edge.bond_symbol
		                bond_idx = edge.bond_idx

		                self.set_bond_symbol(bond_symbol, bond_idx)

		                # branches record the next symbol as their bond, so we
		                # must update accordingly
		                if (bond_idx > 0) and \
		                        (self.smiles_symbols[bond_idx - 1][2] == BRANCH_TYPE):
		                    self.set_bond_symbol(bond_symbol, bond_idx - 1)


		class Bond:
		    """An edge of a MolecularGraph, i.e. a bond between two atoms.

		    All indices below refer to positions in the ``smiles_symbols`` list of
		    the MolecularGraph. Two bonds compare (and hash) equal when they join
		    the same ordered pair of atom indices; the bond symbol and the bond
		    index play no role in the comparison.

		    :ivar idx_a: the index of one atom or node of this bond.
		    :ivar idx_b: the index of the other atom or node of this bond.
		    :ivar bond_symbol: the SMILES symbol representing this bond (e.g. '#').
		    :ivar bond_idx: the index of this bond or edge.
		    """

		    def __init__(self, idx_a, idx_b, bond_symbol, bond_idx):
		        self.idx_a, self.idx_b = idx_a, idx_b
		        self.bond_symbol, self.bond_idx = bond_symbol, bond_idx

		    def __eq__(self, other):
		        # leave the comparison to the other operand if it is not a Bond
		        if not isinstance(other, type(self)):
		            return NotImplemented

		        own_ends = (self.idx_a, self.idx_b)
		        other_ends = (other.idx_a, other.idx_b)
		        return own_ends == other_ends

		    def __hash__(self):
		        # must agree with __eq__, so only the two ends are hashed
		        ends = (self.idx_a, self.idx_b)
		        return hash(ends)

		    def other_end(self, idx):
		        """Returns the end of this bond that is opposite to a given end.

		        :param idx: an index of one atom or node of this bond.
		        :return: the index of the other atom or node of this bond, or
		            None if ``idx`` is an invalid input.
		        """

		        if idx == self.idx_a:
		            return self.idx_b
		        if idx == self.idx_b:
		            return self.idx_a
		        return None

-288

selfies/utils.py

		from typing import Dict, Iterable, List, Set, Tuple, Union


		def len_selfies(selfies: str) -> int:
		    """Returns the number of symbols a SELFIES is made of.

		    This is the symbol length of the SELFIES, which differs from the
		    length of the string (i.e. ``len(selfies)``). It is obtained by
		    counting the opening brackets and the ``'.'`` dot-bond symbols.

		    :param selfies: a SELFIES.
		    :return: the symbol length of ``selfies``.

		    :Example:

		    >>> import selfies
		    >>> selfies.len_selfies('[C][O][C]')
		    3
		    >>> selfies.len_selfies('[C][=C][F].[C]')
		    5
		    """

		    num_bracketed = selfies.count("[")
		    num_dots = selfies.count(".")
		    return num_bracketed + num_dots


		def split_selfies(selfies: str) -> Iterable[str]:
		    """Splits a SELFIES into its symbols.

		    Returns an iterable that yields the symbols of a SELFIES one-by-one
		    in the order they appear in the string. SELFIES symbols are always
		    either indicated by an open and closed square bracket, or are the ``'.'``
		    dot-bond symbol.

		    :param selfies: the SELFIES to be read.
		    :return: an iterable of the symbols of ``selfies`` in the same order
		        they appear in the string.
		    :raises ValueError: if a symbol is opened but never closed by a
		        ``']'``. As this function is a generator, the error is raised
		        while iterating, once the hanging symbol is reached.

		    :Example:

		    >>> import selfies
		    >>> list(selfies.split_selfies('[C][O][C]'))
		    ['[C]', '[O]', '[C]']
		    >>> list(selfies.split_selfies('[C][=C][F].[C]'))
		    ['[C]', '[=C]', '[F]', '.', '[C]']
		    """

		    left_idx = selfies.find("[")

		    while 0 <= left_idx < len(selfies):
		        right_idx = selfies.find("]", left_idx + 1)

		        # Without this check, the -1 returned by find() would produce an
		        # empty symbol and reset left_idx to 0, so the generator would
		        # never terminate.
		        if right_idx == -1:
		            raise ValueError(
		                "malformed SELFIES string, hanging '[' bracket"
		            )

		        yield selfies[left_idx: right_idx + 1]

		        left_idx = right_idx + 1
		        # slicing (rather than indexing) is safe at the end of the string
		        if selfies[left_idx: left_idx + 1] == ".":
		            yield "."
		            left_idx += 1


		def get_alphabet_from_selfies(selfies_iter: Iterable[str]) -> Set[str]:
		    """Builds an alphabet out of an iterable of SELFIES.

		    The result is the smallest set of SELFIES symbols from which every
		    SELFIES in the iterable can be put together. The dot-bond symbol
		    ``'.'`` is never a member of the returned set, even if it occurs in
		    the input.

		    :param selfies_iter: an iterable of SELFIES.
		    :return: the SELFIES alphabet built from the SELFIES in ``selfies_iter``.

		    :Example:

		    >>> import selfies
		    >>> selfies_list = ['[C][F][O]', '[C].[O]', '[F][F]']
		    >>> alphabet = selfies.get_alphabet_from_selfies(selfies_list)
		    >>> sorted(list(alphabet))
		    ['[C]', '[F]', '[O]']
		    """

		    symbols = {
		        symbol
		        for selfies_str in selfies_iter
		        for symbol in split_selfies(selfies_str)
		    }
		    symbols.discard(".")  # the dot-bond is not part of the alphabet
		    return symbols


		def selfies_to_encoding(
		        selfies: str,
		        vocab_stoi: Dict[str, int],
		        pad_to_len: int = -1,
		        enc_type: str = 'both'
		) -> Union[List[int], List[List[int]], Tuple[List[int], List[List[int]]]]:
		    """Encodes a SELFIES as labels (integers) and/or as one-hot vectors.

		    The label encoding is a list of size ``(N,)`` and the one-hot encoding
		    is a list of size ``(N, len(vocab_stoi))``, where ``N`` is the symbol
		    length of the (potentially padded) SELFIES. Padding is done with the
		    special SELFIES symbol ``[nop]``.

		    :param selfies: the SELFIES to be encoded.
		    :param vocab_stoi: a dictionary that maps SELFIES symbols (the keys)
		        to a non-negative index. The indices of the dictionary
		        must be contiguous, starting from 0.
		    :param pad_to_len: the length the SELFIES is to be padded to.
		        If ``pad_to_len`` is less than or equal to the symbol
		        length of the SELFIES, then no padding is added. Defaults to ``-1``.
		    :param enc_type: the type of encoding of the output:
		        ``label`` or ``one_hot`` or ``both``.
		        If the value is ``both``, then a tuple of the label and one-hot
		        encoding are returned (in that order). Defaults to ``both``.
		    :return: the label encoded and/or one-hot encoded SELFIES.
		    :raises ValueError: if ``enc_type`` is not one of the three
		        supported values.

		    :Example:

		    >>> import selfies as sf
		    >>> sf.selfies_to_encoding('[C][F]', {'[C]': 0, '[F]': 1})
		    ([0, 1], [[1, 0], [0, 1]])
		    """

		    if enc_type not in ('label', 'one_hot', 'both'):
		        raise ValueError("enc_type must be in ('label', 'one_hot', 'both')")

		    # fill up with [nop] symbols until the requested length is reached
		    num_missing = pad_to_len - len_selfies(selfies)
		    if num_missing > 0:
		        selfies += "[nop]" * num_missing

		    # label encoding: one vocabulary index per symbol
		    labels = [vocab_stoi[symbol] for symbol in split_selfies(selfies)]
		    if enc_type == 'label':
		        return labels

		    # one-hot encoding: one row per symbol, with a 1 at the label's column
		    num_columns = len(vocab_stoi)
		    one_hot_rows = []
		    for label in labels:
		        row = [0] * num_columns
		        row[label] = 1
		        one_hot_rows.append(row)

		    return one_hot_rows if enc_type == 'one_hot' else (labels, one_hot_rows)


		def encoding_to_selfies(
		        encoded: Union[List[int], List[List[int]]],
		        vocab_itos: Dict[int, str],
		        enc_type: str,
		) -> str:
		    """Turns a label (integer) or one-hot encoded list back into
		    a SELFIES string.

		    A label encoded input is expected to be a list of size ``(N,)``,
		    and a one-hot encoded input is expected to be a 2D list of
		    size ``(N, len(vocab_itos))``.

		    :param encoded: a label or one-hot encoded list.
		    :param vocab_itos: a dictionary that maps non-negative indices (the keys)
		        to SELFIES symbols. The indices of the dictionary
		        must be contiguous, starting from 0.
		    :param enc_type: the type of encoding of the input:
		        ``label`` or ``one_hot``.
		    :return: the SELFIES string represented by the encoded input.
		    :raises ValueError: if ``enc_type`` is neither ``label`` nor
		        ``one_hot``.

		    :Example:

		    >>> import selfies as sf
		    >>> one_hot = [[0, 1, 0], [0, 0, 1], [1, 0, 0]]
		    >>> vocab_itos = {0: '[nop]', 1: '[C]', 2: '[F]'}
		    >>> sf.encoding_to_selfies(one_hot, vocab_itos, enc_type='one_hot')
		    '[C][F][nop]'

		    """

		    if enc_type not in ('label', 'one_hot'):
		        raise ValueError("enc_type must be in ('label', 'one_hot')")

		    if enc_type == 'label':
		        labels = encoded
		    else:
		        # the label of a one-hot row is the position of its first 1
		        labels = [row.index(1) for row in encoded]

		    # labels -> symbols -> SELFIES
		    return "".join(vocab_itos[label] for label in labels)


		def batch_selfies_to_flat_hot(
		        selfies_batch: List[str],
		        vocab_stoi: Dict[str, int],
		        pad_to_len: int = -1,
		) -> List[List[int]]:
		    """Encodes every SELFIES of a list as a flattened one-hot vector.

		    The result is a list of size ``(batch_size, N * len(vocab_stoi))``,
		    where ``N`` is the symbol length of the (potentially padded) SELFIES.
		    Padding is done with the special SELFIES symbol ``[nop]``.

		    :param selfies_batch: a list of SELFIES to be converted.
		    :param vocab_stoi: a dictionary that maps SELFIES symbols (the keys)
		        to a non-negative index. The indices of the dictionary
		        must be contiguous, starting from 0.
		    :param pad_to_len: the length that each SELFIES is to be padded to.
		        If ``pad_to_len`` is less than or equal to the symbol
		        length of the SELFIES, then no padding is added. Defaults to ``-1``.
		    :return: the flattened one-hot encoded representations of the SELFIES
		        from the batch. This is a 2D list of size
		        ``(batch_size, N * len(vocab_stoi))``.

		    :Example:

		    >>> import selfies as sf
		    >>> batch = ["[C]", "[C][C]"]
		    >>> vocab_stoi = {'[nop]': 0, '[C]': 1}
		    >>> sf.batch_selfies_to_flat_hot(batch, vocab_stoi, 2)
		    [[0, 1, 1, 0], [0, 1, 0, 1]]

		    """

		    # for each SELFIES, the rows of its one-hot encoding are laid out
		    # one after another
		    return [
		        [
		            bit
		            for row in selfies_to_encoding(selfies_str, vocab_stoi,
		                                           pad_to_len, enc_type='one_hot')
		            for bit in row
		        ]
		        for selfies_str in selfies_batch
		    ]


		def batch_flat_hot_to_selfies(
		        one_hot_batch: List[List[int]],
		        vocab_itos: Dict[int, str],
		) -> List[str]:
		    """Decodes a batch of flattened one-hot encodings into
		    a list of SELFIES.

		    ``one_hot_batch`` is expected to be a list of size ``(batch_size, S)``,
		    where ``S`` is divisible by the length of the vocabulary.

		    :param one_hot_batch: a list of flattened one-hot encoded
		        representations.
		    :param vocab_itos: a dictionary that maps non-negative indices
		        (the keys) to SELFIES symbols. The indices of the dictionary
		        are expected to be contiguous and starting from 0.
		    :return: a list of SELFIES strings.
		    :raises ValueError: if the length of a flattened vector is not
		        divisible by the length of the vocabulary.

		    :Example:

		    >>> import selfies as sf
		    >>> batch = [[0, 1, 1, 0], [0, 1, 0, 1]]
		    >>> vocab_itos = {0: '[nop]', 1: '[C]'}
		    >>> sf.batch_flat_hot_to_selfies(batch, vocab_itos)
		    ['[C][nop]', '[C][C]']

		    """

		    decoded = []

		    for flat_vec in one_hot_batch:
		        width = len(vocab_itos)
		        total = len(flat_vec)

		        if total % width != 0:
		            raise ValueError("size of vector in one_hot_batch not divisible "
		                             "by the length of the vocabulary.")

		        # Un-flatten: consecutive chunks of ``width`` entries are the
		        # one-hot rows, one row per symbol position in the SELFIES.
		        rows = [flat_vec[start: start + width]
		                for start in range(0, total, width)]

		        decoded.append(
		            encoding_to_selfies(rows, vocab_itos, enc_type='one_hot')
		        )

		    return decoded

selfies - npm Package Compare versions

Improved metrics