Skip to content

smiles_utils

learn_label_encoding(tokenized_inputs)

Learn a label encoding from a tokenized dataset. The padding token, "[PAD]", is always assigned the label 0.

Parameters:

Name Type Description Default
tokenized_inputs List[List[str]]

SMILES of the molecules in the dataset, tokenized into a list of tokens.

required

Returns:

Type Description
Dict[str, int]

A dictionary mapping SMILES tokens to integer labels.

Source code in s4dd/smiles_utils.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def learn_label_encoding(tokenized_inputs: List[List[str]]) -> Dict[str, int]:
    """Build a token-to-label mapping from a tokenized dataset.

    Labels are consecutive integers handed out in order of first appearance.
    The padding token, `"[PAD]"`, is registered before any input is read and
    therefore always receives the label 0.

    Parameters
    ----------
    tokenized_inputs : List[List[str]]
        SMILES of the molecules in the dataset, each one tokenized into a list of tokens.

    Returns
    -------
    Dict[str, int]
        A dictionary mapping SMILES tokens to integer labels.
    """
    # Seed the vocabulary with the padding token so it is guaranteed label 0.
    token2label: Dict[str, int] = {"[PAD]": 0}
    for tokens in tokenized_inputs:
        for token in tokens:
            # The current vocabulary size is the next free label; setdefault
            # leaves tokens that were already seen untouched.
            token2label.setdefault(token, len(token2label))

    return token2label

pad_sequences(sequences, padding_length, padding_value)

Pad sequences to a given length. The padding is done at the end of the sequences. Longer sequences are truncated from the beginning, i.e. only their last padding_length elements are kept.

Parameters:

Name Type Description Default
sequences List[List[Union[str, int]]]

A list of sequences, either tokenized or label encoded SMILES.

required
padding_length int

The length to pad the sequences to.

required
padding_value Union[str, int]

The value to pad the sequences with.

required

Returns:

Type Description
List[List[Union[str, int]]]

The padded sequences.

Source code in s4dd/smiles_utils.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def pad_sequences(
    sequences: List[List[Union[str, int]]],
    padding_length: int,
    padding_value: Union[str, int],
) -> List[List[Union[str, int]]]:
    """Pad sequences to a given length. The padding is done at the end of the sequences.
    Longer sequences are truncated from the beginning, i.e. only their last
    `padding_length` elements are kept.

    Parameters
    ----------
    sequences : List[List[Union[str, int]]]
        A list of sequences, either tokenized or label encoded SMILES.
    padding_length : int
        The length to pad the sequences to. Must be non-negative.
    padding_value : Union[str, int]
        The value to pad the sequences with.

    Returns
    -------
    List[List[Union[str, int]]]
        The padded sequences. Every returned sequence is a new list of
        exactly `padding_length` elements; the inputs are not modified.

    Raises
    ------
    ValueError
        If `padding_length` is negative.
    """
    if padding_length < 0:
        raise ValueError(
            f"padding_length must be non-negative, got {padding_length}"
        )

    padded_sequences = []
    for seq in sequences:
        n_missing = padding_length - len(seq)
        if n_missing > 0:
            # Too short: append padding at the end.
            padded_sequences.append(seq + [padding_value] * n_missing)
        else:
            # Long enough: keep the trailing `padding_length` elements. An
            # explicit start index is used because `seq[-0:]` would return
            # the whole sequence when `padding_length` is 0.
            padded_sequences.append(seq[len(seq) - padding_length:])

    return padded_sequences

segment_smiles(smiles, segment_sq_brackets=True)

Segment a SMILES string into tokens.

Parameters:

Name Type Description Default
smiles str

A SMILES string.

required
segment_sq_brackets bool

Whether to segment the square brackets "[" and "]" as tokens. The default is True.

True

Returns:

Type Description
List[str]

A list of tokens.

Source code in s4dd/smiles_utils.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def segment_smiles(smiles: str, segment_sq_brackets=True) -> List[str]:
    """Split a SMILES string into its tokens.

    Parameters
    ----------
    smiles : str
        A SMILES string.
    segment_sq_brackets : bool
        Whether to segment the square brackets `"["` and `"]"` as tokens.
        The default is `True`.

    Returns
    -------
    List[str]
        The tokens found in `smiles`, in order of occurrence.
    """
    # Choose which precompiled pattern to apply based on the bracket setting.
    pattern_key = "segmentation_sq" if segment_sq_brackets else "segmentation"
    return _RE_PATTERNS[pattern_key].findall(smiles)

segment_smiles_batch(smiles_batch, segment_sq_brackets=True)

Segment a batch of SMILES strings into tokens.

Parameters:

Name Type Description Default
smiles_batch List[str]

A batch of SMILES strings.

required
segment_sq_brackets bool

Whether to segment the square brackets "[" and "]" as tokens. The default is True.

True

Returns:

Type Description
List[List[str]]

A list of lists of tokens.

Source code in s4dd/smiles_utils.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def segment_smiles_batch(
    smiles_batch: List[str], segment_sq_brackets=True
) -> List[List[str]]:
    """Tokenize every SMILES string in a batch.

    Each string is segmented independently with `segment_smiles`, using the
    same `segment_sq_brackets` setting for all of them.

    Parameters
    ----------
    smiles_batch : List[str]
        A batch of SMILES strings.
    segment_sq_brackets : bool
        Whether to segment the square brackets `"["` and `"]"` as tokens.
        The default is `True`.

    Returns
    -------
    List[List[str]]
        One list of tokens per input string, in the same order as `smiles_batch`.
    """
    tokenized_batch = []
    for smiles in smiles_batch:
        tokenized_batch.append(segment_smiles(smiles, segment_sq_brackets))
    return tokenized_batch