class sanitize_identifier_fn
A parameterized function class that sanitizes strings (group/label values) to make them safe for use as Python attribute names in AttrTree structures by converting special characters to their unicode names and applying transformations.
/tf/active/vicechatdev/patches/util.py
586 - 789
complex
Purpose
This class converts arbitrary strings containing special characters, unicode, and whitespace into valid Python identifiers suitable for attribute access. It uses unicode character names (e.g., '$' becomes 'dollar_sign'), applies customizable filtering, substitutions, and transformations to shorten names, and maintains a lookup cache for performance. The class is designed to handle internationalization while ensuring the resulting identifiers are valid Python attribute names.
Source Code
class sanitize_identifier_fn(param.ParameterizedFunction):
    """
    Sanitizes group/label values for use in AttrTree attribute
    access.

    Special characters are sanitized using their (lowercase) unicode
    name using the unicodedata module. For instance:

    >>> unicodedata.name(u'$').lower()
    'dollar sign'

    As these names are often very long, this parameterized function
    allows eliminations, substitutions and transforms to help shorten
    these names appropriately.

    Results are memoized in ``_lookup_table``; the cache is keyed on
    the input name only, so it does not notice later changes to the
    parameters above.
    """

    capitalize = param.Boolean(default=True, doc="""
        Whether the first letter should be converted to
        uppercase. Note, this will only be applied to ASCII characters
        in order to make sure paths aren't confused with method
        names.""")

    eliminations = param.List(['extended', 'accent', 'small', 'letter', 'sign', 'digit',
                               'latin', 'greek', 'arabic-indic', 'with', 'dollar'], doc="""
        Lowercase strings to be eliminated from the unicode names in
        order to shorten the sanitized name ( lowercase). Redundant
        strings should be removed but too much elimination could cause
        two unique strings to map to the same sanitized output.""")

    substitutions = param.Dict(default={'circumflex':'power',
                                        'asterisk':'times',
                                        'solidus':'over'}, doc="""
        Lowercase substitutions of substrings in unicode names. For
        instance the ^ character has the name 'circumflex accent' even
        though it is more typically used for exponentiation. Note that
        substitutions occur after filtering and that there should be no
        ordering dependence between substitutions.""")

    transforms = param.List(default=[capitalize_unicode_name], doc="""
        List of string transformation functions to apply after
        filtering and substitution in order to further compress the
        unicode name. For instance, the default capitalize_unicode_name
        function will turn the string "capital delta" into "Delta".""")

    disallowed = param.List(default=['trait_names', '_ipython_display_',
                                     '_getAttributeNames'], doc="""
        An explicit list of name that should not be allowed as
        attribute names on Tree objects.
        By default, prevents IPython from creating an entry called
        Trait_names due to an inconvenient getattr check (during
        tab-completion).""")

    disable_leading_underscore = param.Boolean(default=False, doc="""
        Whether leading underscores should be allowed to be sanitized
        with the leading prefix.""")

    aliases = param.Dict(default={}, doc="""
        A dictionary of aliases mapping long strings to their short,
        sanitized equivalents""")

    # Prepended to the sanitized result whenever prefixed() is true.
    prefix = 'A_'

    _lookup_table = param.Dict(default={}, doc="""
        Cache of previously computed sanitizations""")

    @param.parameterized.bothmethod
    def add_aliases(self_or_cls, **kwargs):
        """
        Conveniently add new aliases as keyword arguments. For instance
        you can add a new alias with add_aliases(short='Longer string')

        The mapping is stored inverted (long string -> short alias) so
        that __call__ can look up the incoming long string directly.
        """
        self_or_cls.aliases.update({v: k for k, v in kwargs.items()})

    @param.parameterized.bothmethod
    def remove_aliases(self_or_cls, aliases):
        """
        Remove a list of aliases.

        ``aliases`` holds the short alias values; every entry of the
        aliases dictionary whose value is in that collection is removed.
        """
        # Collect the keys first: deleting from a dict while iterating
        # over it raises RuntimeError on Python 3.
        doomed = [k for k, v in self_or_cls.aliases.items() if v in aliases]
        for k in doomed:
            del self_or_cls.aliases[k]

    @param.parameterized.bothmethod
    def allowable(self_or_cls, name, disable_leading_underscore=None):
        """
        Return whether ``name`` may be used as an attribute name.

        A name is rejected when it is in the ``disallowed`` list, when
        it is one of the ``_repr_<format>_`` names listed below, or
        when it starts with an underscore while leading underscores are
        disabled. The ``disable_leading_underscore`` argument overrides
        the parameter of the same name unless it is None.
        """
        disabled_reprs = ['javascript', 'jpeg', 'json', 'latex',
                          'latex', 'pdf', 'png', 'svg', 'markdown']
        disabled_ = (self_or_cls.disable_leading_underscore
                     if disable_leading_underscore is None
                     else disable_leading_underscore)
        if disabled_ and name.startswith('_'):
            return False
        isrepr = any(('_repr_%s_' % el) == name for el in disabled_reprs)
        return (name not in self_or_cls.disallowed) and not isrepr

    @param.parameterized.bothmethod
    def prefixed(self, identifier):
        """
        Whether or not the identifier will be prefixed.
        Strings that require the prefix are generally not recommended.

        True for a leading underscore or a first character whose
        unicode category is Mn, Mc, Nd or Pc. ``identifier`` must be
        non-empty.
        """
        invalid_starting = ['Mn', 'Mc', 'Nd', 'Pc']
        if identifier.startswith('_'):
            return True
        return unicodedata.category(identifier[0]) in invalid_starting

    @param.parameterized.bothmethod
    def remove_diacritics(self_or_cls, identifier):
        """
        Remove diacritics and accents from the input leaving other
        unicode characters alone.
        """
        chars = []
        for c in identifier:
            replacement = unicodedata.normalize('NFKD', c).encode('ASCII', 'ignore')
            # ``replacement`` is bytes, so it must be tested for
            # emptiness rather than compared with the str ''. An empty
            # result means the character has no ASCII decomposition and
            # is kept unchanged.
            chars.append(bytes_to_unicode(replacement) if replacement else c)
        return ''.join(chars)

    @param.parameterized.bothmethod
    def shortened_character_name(self_or_cls, c, eliminations=[], substitutions={}, transforms=[]):
        """
        Given a unicode character c, return the shortened unicode name
        as a single string by applying the eliminations, substitutions
        and transforms (in that order). Runs of whitespace and hyphens
        in the result are replaced by underscores; the result may be
        the empty string if everything was eliminated.

        unicodedata.name raises ValueError for characters that have no
        unicode name; that error is not handled here.

        The default arguments are only read, never mutated.
        """
        name = unicodedata.name(c).lower()
        # Filtering
        for elim in eliminations:
            name = name.replace(elim, '')
        # Substitution
        for i, o in substitutions.items():
            name = name.replace(i, o)
        for transform in transforms:
            name = transform(name)
        return ' '.join(name.strip().split()).replace(' ', '_').replace('-', '_')

    def __call__(self, name, escape=True):
        """
        Return the sanitized form of ``name``.

        None and the empty string are returned unchanged; aliased names
        return their alias and previously seen names return the cached
        result. Otherwise the name is optionally capitalized, sanitized
        and, if prefixed() is true for it, given the prefix.

        Raises AttributeError if the name is not allowable. The
        ``escape`` argument is accepted but not used.
        """
        if name in [None, '']:
            return name
        elif name in self.aliases:
            return self.aliases[name]
        elif name in self._lookup_table:
            return self._lookup_table[name]

        # Remember the key exactly as the caller supplied it. The cache
        # is queried with the raw name above, so it must be stored
        # under the raw name too, not the decoded/capitalized one.
        cache_key = name
        name = bytes_to_unicode(name)
        if not self.allowable(name):
            raise AttributeError("String %r is in the disallowed list of attribute names: %r" % (name, self.disallowed))

        if self.capitalize and name and name[0] in string.ascii_lowercase:
            name = name[0].upper() + name[1:]

        sanitized = self.sanitize_py3(name)
        if self.prefixed(name):
            sanitized = self.prefix + sanitized
        self._lookup_table[cache_key] = sanitized
        return sanitized

    def _process_underscores(self, tokens):
        "Strip underscores to make sure the number is correct after join"
        # Merge each run of consecutive '_' tokens into one token.
        groups = [[''.join(el)] if b else list(el)
                  for (b, el) in itertools.groupby(tokens, lambda k: k == '_')]
        flattened = [el for group in groups for el in group]
        processed = []
        for token in flattened:
            if token == '_':
                continue
            # At most one underscore is stripped from each end.
            if token.startswith('_'):
                token = token[1:]
            if token.endswith('_'):
                token = token[:-1]
            processed.append(token)
        return processed

    def sanitize_py3(self, name):
        """
        Return ``name`` unchanged if it is already a valid identifier,
        otherwise the underscore-joined tokens produced by sanitize().
        """
        if name.isidentifier():
            return name
        # Prefixing '_' lets characters such as digits, which are valid
        # only in a non-leading position, count as valid here.
        return '_'.join(self.sanitize(name, lambda c: ('_' + c).isidentifier()))

    def sanitize(self, name, valid_fn):
        """
        Split ``name`` into a list of identifier-safe tokens.

        The name is split on whitespace. Within each word, consecutive
        characters accepted by ``valid_fn`` are accumulated into one
        token, and each rejected character is replaced by its shortened
        unicode name as a separate token (omitted if that name is
        empty). The tokens are finally passed through
        _process_underscores.

        Raises Exception if the name contains one of the control
        characters \\a, \\b, \\v, \\f or \\r.
        """
        invalid = {'\a':'a','\b':'b', '\v':'v','\f':'f','\r':'r'}
        for cc, letter in invalid.items():
            if cc in name:
                raise Exception(r"Please use a raw string or escape control code '\%s'"
                                % letter)

        sanitized, chars = [], ''
        for split in name.split():
            for c in split:
                if valid_fn(c):
                    chars += c
                    continue
                short = self.shortened_character_name(c, self.eliminations,
                                                      self.substitutions,
                                                      self.transforms)
                # Flush the valid run collected so far before emitting
                # the name of the invalid character.
                if chars:
                    sanitized.append(chars)
                if short != '':
                    sanitized.append(short)
                chars = ''
            # A word boundary always ends the current token.
            if chars:
                sanitized.append(chars)
                chars = ''
        return self._process_underscores(sanitized)
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
param.ParameterizedFunction | - |
Parameter Details
capitalize: Boolean (default: True) - Whether to capitalize the first ASCII letter of the sanitized identifier to distinguish it from method names
eliminations: List of strings (default: ['extended', 'accent', 'small', 'letter', 'sign', 'digit', 'latin', 'greek', 'arabic-indic', 'with', 'dollar']) - Lowercase substrings to remove from unicode character names to shorten the output
substitutions: Dictionary (default: {'circumflex':'power', 'asterisk':'times', 'solidus':'over'}) - Mapping of unicode name substrings to shorter alternatives for common mathematical symbols
transforms: List of functions (default: [capitalize_unicode_name]) - String transformation functions applied after filtering and substitution to further compress unicode names
disallowed: List of strings (default: ['trait_names', '_ipython_display_', '_getAttributeNames']) - Explicit list of names that should not be allowed as attribute names, primarily to prevent IPython conflicts
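disable_leading_underscore: Boolean (default: False) - When True, allowable() rejects any name that starts with an underscore, so calling the sanitizer on such a name raises AttributeError; when False, such names are sanitized and given the prefix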
aliases: Dictionary (default: {}) - Mapping of long strings to their short, pre-defined sanitized equivalents for common cases
prefix: String (default: 'A_') - Prefix added to identifiers that would otherwise be invalid (e.g., starting with digits or underscores)
_lookup_table: Dictionary (default: {}) - Internal cache storing previously computed sanitizations for performance optimization
Return Value
When called as a function (via __call__), returns a sanitized string that is a valid Python identifier. Returns None or empty string unchanged. For aliased names, returns the alias. For cached names, returns the cached value. Otherwise, returns a newly sanitized identifier, potentially prefixed if it starts with invalid characters. Methods like add_aliases and remove_aliases return None (modify state in place). Methods like allowable return boolean values. Methods like shortened_character_name return string tokens.
Class Interface
Methods
__call__(self, name: str, escape: bool = True) -> str
Purpose: Main method to sanitize a string into a valid Python identifier
Parameters:
name: The string to sanitize (can be None or empty string)
escape: Whether to apply escaping (parameter present but not actively used in implementation)
Returns: A sanitized string suitable for use as a Python attribute name, or None/empty string if input was None/empty
add_aliases(self_or_cls, **kwargs) -> None
Purpose: Conveniently add new aliases as keyword arguments where key is the short form and value is the long form
Parameters:
kwargs: Keyword arguments where keys are short aliases and values are the long strings they represent
Returns: None - modifies the aliases dictionary in place
remove_aliases(self_or_cls, aliases: list) -> None
Purpose: Remove a list of aliases from the aliases dictionary
Parameters:
aliases: List of alias values (short forms) to remove
Returns: None - modifies the aliases dictionary in place
allowable(self_or_cls, name: str, disable_leading_underscore: bool = None) -> bool
Purpose: Check if a name is allowed as an attribute name (not in disallowed list and not a special IPython/Jupyter representation method)
Parameters:
name: The name to check
disable_leading_underscore: Override for the class-level disable_leading_underscore setting
Returns: True if the name is allowable, False otherwise
prefixed(self, identifier: str) -> bool
Purpose: Determine whether an identifier will require the prefix (starts with underscore or invalid unicode category)
Parameters:
identifier: The identifier string to check
Returns: True if the identifier will be prefixed, False otherwise
remove_diacritics(self_or_cls, identifier: str) -> str
Purpose: Remove diacritics and accents from the input string while preserving other unicode characters
Parameters:
identifier: The string from which to remove diacritics
Returns: String with diacritics removed using NFKD normalization
shortened_character_name(self_or_cls, c: str, eliminations: list = [], substitutions: dict = {}, transforms: list = []) -> str
Purpose: Convert a single unicode character to its shortened unicode name by applying eliminations, substitutions, and transforms
Parameters:
c: A single unicode character
eliminations: List of substrings to eliminate from the unicode name
substitutions: Dictionary of substring replacements
transforms: List of transformation functions to apply
Returns: Shortened unicode name as a string with spaces replaced by underscores
sanitize_py3(self, name: str) -> str
Purpose: Python 3 specific sanitization using the isidentifier() method
Parameters:
name: The string to sanitize
Returns: Sanitized identifier string, either the original if already valid or processed version
sanitize(self, name: str, valid_fn: callable) -> list
Purpose: Core sanitization logic that processes each character and builds sanitized tokens
Parameters:
name: The string to sanitize
valid_fn: Function that returns True if a character is valid
Returns: List of sanitized token strings
_process_underscores(self, tokens: list) -> list
Purpose: Internal method to clean up underscores in token list, removing leading/trailing underscores from tokens
Parameters:
tokens: List of string tokens that may contain underscores
Returns: Processed list of tokens with cleaned underscores
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
capitalize |
param.Boolean | Whether to capitalize the first ASCII letter of sanitized identifiers | instance |
eliminations |
param.List | List of lowercase strings to eliminate from unicode character names | instance |
substitutions |
param.Dict | Dictionary mapping unicode name substrings to shorter alternatives | instance |
transforms |
param.List | List of transformation functions to apply to unicode names | instance |
disallowed |
param.List | List of names explicitly disallowed as attribute names | instance |
disable_leading_underscore |
param.Boolean | Whether to prevent sanitization of strings with leading underscores | instance |
aliases |
param.Dict | Dictionary mapping long strings to their short sanitized equivalents | instance |
prefix |
str | Prefix string added to identifiers that would otherwise be invalid (default: 'A_') | class |
_lookup_table |
param.Dict | Internal cache of previously computed sanitizations for performance | instance |
Dependencies
param, unicodedata, string, itertools
Required Imports
import param
import unicodedata
import string
import itertools
Usage Example
# Obtain a sanitizer instance. A param.ParameterizedFunction runs
# __call__ when the class itself is called, so `.instance()` is needed
# to get a reusable object.
sanitizer = sanitize_identifier_fn.instance()
# Sanitize a string with special characters
result = sanitizer('My $Variable')
# 'dollar' and 'sign' are both in the default eliminations, so the '$'
# contributes nothing to the result.
print(result) # Output: 'My_Variable'
# Add custom aliases for common strings
sanitizer.add_aliases(MyVar='My Long Variable Name')
result = sanitizer('My Long Variable Name')
print(result) # Output: 'MyVar'
# Check if a name is allowable
if sanitizer.allowable('_private'):
    print('Allowed')
# Create an instance from the class and call it in one expression
result = sanitize_identifier_fn.instance()('Test@123')
# Customize behavior by passing parameter values to instance()
custom_sanitizer = sanitize_identifier_fn.instance(
    capitalize=False,
    eliminations=['sign'],
    substitutions={'asterisk': 'star'}
)
result = custom_sanitizer('value*2')
print(result) # Output: 'value_star_2'
Best Practices
- Use the same sanitizer instance consistently across your application to benefit from the lookup cache
- Add aliases for frequently used long strings to improve performance and readability
- Be cautious when modifying eliminations and substitutions as they can cause collisions where different inputs map to the same output
- The class can be used as both an instance and via class methods using @param.parameterized.bothmethod decorator
- Leading underscores are prefixed by default to avoid conflicts with Python's name mangling; use disable_leading_underscore carefully
- The sanitizer maintains state through the _lookup_table cache, so reusing instances is more efficient than creating new ones
- Raw strings should be used for input containing control characters to avoid exceptions
- The prefixed() method can be used to check if an identifier will require the 'A_' prefix before sanitization
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function capitalize_unicode_name 59.1% similar
-
function sanitize_folders 53.4% similar
-
function tree_attribute 51.3% similar
-
function _sanitize_properties 51.0% similar
-
function sanitize_filename 49.5% similar