Source code for emoji.core

"""
emoji.core
~~~~~~~~~~

Core components for emoji.

"""

import re
import unicodedata
import sys
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union

if sys.version_info < (3, 9):
    from typing_extensions import Literal, Match, TypedDict  # type: ignore
else:
    from typing import Literal, Match, TypedDict

from emoji import unicode_codes
from emoji.tokenizer import (
    Token,
    EmojiMatch,
    EmojiMatchZWJ,
    EmojiMatchZWJNonRGI,
    tokenize,
    filter_tokens,
)

# Names exported by ``from emoji.core import *``.
# The last four entries (Token, EmojiMatch, EmojiMatchZWJ, EmojiMatchZWJNonRGI)
# are imported from ``emoji.tokenizer`` and re-exported from this module.
__all__ = [
    'emojize',
    'demojize',
    'analyze',
    'config',
    'emoji_list',
    'distinct_emoji_list',
    'emoji_count',
    'replace_emoji',
    'is_emoji',
    'purely_emoji',
    'version',
    'Token',
    'EmojiMatch',
    'EmojiMatchZWJ',
    'EmojiMatchZWJNonRGI',
]

# Delimiter used on both sides of an emoji name unless the caller passes
# its own ``delimiters`` tuple, e.g. ``:thumbs_up:``.
_DEFAULT_DELIMITER = ':'
# Characters that may appear inside an emoji name. This string is interpolated
# into a regular-expression character class (``[...]+``) by emojize(), so it is
# a list of characters/escapes, not a complete pattern on its own.
# In Arabic, the Unicode character "\u0655" must be kept, so it is included at
# the end of the pattern below.
_EMOJI_NAME_PATTERN = '\\w\\-&.’”“()!#*+,/«»\u0300\u0301\u0302\u0303\u0306\u0308\u030a\u0327\u064b\u064e\u064f\u0650\u0653\u0654\u3099\u30fb\u309a\u0655'


class _EmojiListReturn(TypedDict):
    """Shape of each entry in the list returned by :func:`emoji_list`."""

    # The emoji that was found in the string
    emoji: str
    # Index in the input string at which the emoji starts
    match_start: int
    # Index at which the emoji ends; the emoji_list() docstring example
    # (start 15, end 16 for a single-character emoji) shows it is exclusive
    match_end: int


class config:
    """Module-wide configuration"""

    demojize_keep_zwj = True
    """Change the behavior of :func:`emoji.demojize()` regarding
    zero-width-joiners (ZWJ/``\\u200D``) in emoji that are not
    "recommended for general interchange" (non-RGI).
    It has no effect on RGI emoji.

    For example this family emoji with different skin tones "👨‍👩🏿‍👧🏻‍👦🏾" contains four
    person emoji that are joined together by three ZWJ characters:
    ``👨\\u200D👩🏿\\u200D👧🏻\\u200D👦🏾``

    If ``True``, the zero-width-joiners will be kept and :func:`emoji.emojize()` can
    reverse the :func:`emoji.demojize()` operation:
    ``emoji.emojize(emoji.demojize(s)) == s``

    The example emoji would be converted to
    ``:man:\\u200d:woman_dark_skin_tone:\\u200d:girl_light_skin_tone:\\u200d:boy_medium-dark_skin_tone:``

    If ``False``, the zero-width-joiners will be removed and :func:`emoji.emojize()`
    can only reverse the individual emoji: ``emoji.emojize(emoji.demojize(s)) != s``

    The example emoji would be converted to
    ``:man::woman_dark_skin_tone::girl_light_skin_tone::boy_medium-dark_skin_tone:``
    """

    replace_emoji_keep_zwj = False
    """Change the behavior of :func:`emoji.replace_emoji()` regarding
    zero-width-joiners (ZWJ/``\\u200D``) in emoji that are not
    "recommended for general interchange" (non-RGI).
    It has no effect on RGI emoji.

    See :attr:`config.demojize_keep_zwj` for more information.
    """

    @staticmethod
    def load_language(language: Union[List[str], str, None] = None):
        """Load one or multiple languages into memory.
        If no language is specified, all languages will be loaded.

        This makes language data accessible in the :data:`EMOJI_DATA` dict.
        For example to access a French emoji name, first load French with

        ``emoji.config.load_language('fr')``

        and then access it with

        ``emoji.EMOJI_DATA['🏄']['fr']``

        Available languages are listed in :data:`LANGUAGES`

        :param language: A single language code, a list of language codes,
            or ``None``/empty to load every language in :data:`LANGUAGES`.
        """
        if isinstance(language, str):
            # A single code is treated as a one-element list
            requested = [language]
        elif language:
            requested = language
        else:
            # None or an empty list: fall back to every known language
            requested = unicode_codes.LANGUAGES

        for code in requested:
            unicode_codes.load_from_json(code)
def emojize(
    string: str,
    delimiters: Tuple[str, str] = (_DEFAULT_DELIMITER, _DEFAULT_DELIMITER),
    variant: Optional[Literal['text_type', 'emoji_type']] = None,
    language: str = 'en',
    version: Optional[float] = None,
    handle_version: Optional[Union[str, Callable[[str, Dict[str, str]], str]]] = None,
) -> str:
    """
    Replace emoji names in a string with Unicode codes.

        >>> import emoji
        >>> print(emoji.emojize("Python is fun :thumbsup:", language='alias'))
        Python is fun 👍
        >>> print(emoji.emojize("Python is fun :thumbs_up:"))
        Python is fun 👍
        >>> print(emoji.emojize("Python is fun {thumbs_up}", delimiters = ("{", "}")))
        Python is fun 👍
        >>> print(emoji.emojize("Python is fun :red_heart:", variant="text_type"))
        Python is fun ❤
        >>> print(emoji.emojize("Python is fun :red_heart:", variant="emoji_type"))
        Python is fun ❤️ # red heart, not black heart

    :param string: String contains emoji names.
    :param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER. Each delimiter
        should contain at least one character that is not part of a-zA-Z0-9 and ``_-&.()!?#*+,``.
        See ``emoji.core._EMOJI_NAME_PATTERN`` for the regular expression of unsafe characters.
        An empty closing delimiter is accepted; the name then extends over the longest
        run of emoji-name characters.
    :param variant: (optional) Choose variation selector between "base"(None), VS-15 ("text_type") and VS-16 ("emoji_type")
    :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
        to use English aliases
    :param version: (optional) Max version. If set to an Emoji Version,
        all emoji above this version will be ignored.
    :param handle_version: (optional) Replace the emoji above ``version``
        instead of ignoring it. handle_version can be either a string or a
        callable; If it is a callable, it's passed the Unicode emoji and the
        data dict from :data:`EMOJI_DATA` (extended with the keys
        ``match_start`` and ``match_end``) and must return a replacement
        string to be used::

            handle_version('\\U0001F6EB', {
                'en' : ':airplane_departure:',
                'status' : fully_qualified,
                'E' : 1,
                'alias' : [':flight_departure:'],
                'de': ':abflug:',
                'es': ':avión_despegando:',
                ...
            })

    :raises ValueError: if ``variant`` is neither None, 'text_type' or 'emoji_type'.
        The check is only reached when a matched emoji has a variant form.
    """
    unicode_codes.load_from_json(language)

    opening = delimiters[0]
    closing = delimiters[1]
    pattern = re.compile(
        '(%s[%s]+%s)'
        % (re.escape(opening), _EMOJI_NAME_PATTERN, re.escape(closing))
    )

    def replace(match: Match[str]) -> str:
        token = match.group(1)
        # Use an explicit end index. ``token[a:-len(closing)]`` degenerates to
        # ``token[a:0]`` (always empty) when the closing delimiter is empty,
        # which made every lookup fail.
        name = token[len(opening) : len(token) - len(closing)]
        # Names are stored with the default delimiter and in NFKC form
        emj = unicode_codes.get_emoji_by_name(
            _DEFAULT_DELIMITER
            + unicodedata.normalize('NFKC', name)
            + _DEFAULT_DELIMITER,
            language,
        )
        if emj is None:
            # Unknown name: leave the text untouched
            return token

        if version is not None and unicode_codes.EMOJI_DATA[emj]['E'] > version:
            if callable(handle_version):
                # Hand the callback a copy so it cannot mutate EMOJI_DATA
                emj_data = unicode_codes.EMOJI_DATA[emj].copy()
                emj_data['match_start'] = match.start()
                emj_data['match_end'] = match.end()
                return handle_version(emj, emj_data)
            elif handle_version is not None:
                return str(handle_version)
            else:
                return ''

        if variant is None or 'variant' not in unicode_codes.EMOJI_DATA[emj]:
            return emj

        if emj[-1] == '\ufe0e' or emj[-1] == '\ufe0f':
            # Remove an existing variant
            emj = emj[0:-1]
        if variant == 'text_type':
            return emj + '\ufe0e'
        elif variant == 'emoji_type':
            return emj + '\ufe0f'
        else:
            raise ValueError(
                "Parameter 'variant' must be either None, 'text_type' or 'emoji_type'"
            )

    return pattern.sub(replace, string)
def analyze(
    string: str, non_emoji: bool = False, join_emoji: bool = True
) -> Iterator[Token]:
    """
    Find unicode emoji in a string. Yield each emoji as a named tuple
    :class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``.
    If ``non_emoji`` is True, also yield all other characters as
    :class:`Token` ``(char, char)`` .

    :param string: String to analyze
    :param non_emoji: If True also yield all non-emoji characters as Token(char, char)
    :param join_emoji: If True, multiple EmojiMatch are merged into a single
        EmojiMatchZWJNonRGI if they are separated only by a ZWJ.
    """
    # ZWJ characters are always kept at this stage; the filter step decides
    # what is dropped or joined.
    raw_tokens = tokenize(string, keep_zwj=True)
    emoji_only = not non_emoji
    return filter_tokens(raw_tokens, emoji_only=emoji_only, join_emoji=join_emoji)
def demojize(
    string: str,
    delimiters: Tuple[str, str] = (_DEFAULT_DELIMITER, _DEFAULT_DELIMITER),
    language: str = 'en',
    version: Optional[float] = None,
    handle_version: Optional[Union[str, Callable[[str, Dict[str, str]], str]]] = None,
) -> str:
    """
    Replace Unicode emoji in a string with emoji shortcodes. Useful for storage.

        >>> import emoji
        >>> print(emoji.emojize("Python is fun :thumbs_up:"))
        Python is fun 👍
        >>> print(emoji.demojize("Python is fun 👍"))
        Python is fun :thumbs_up:
        >>> print(emoji.demojize("Unicode is tricky 😯", delimiters=("__", "__")))
        Unicode is tricky __hushed_face__

    :param string: String contains Unicode characters. MUST BE UNICODE.
    :param delimiters: (optional) Use delimiters other than ``_DEFAULT_DELIMITER``
    :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias'
        to use English aliases
    :param version: (optional) Max version. If set to an Emoji Version,
        all emoji above this version will be removed.
    :param handle_version: (optional) Replace the emoji above ``version``
        instead of removing it. handle_version can be either a string or a
        callable ``handle_version(emj: str, data: dict) -> str``; If it is
        a callable, it's passed the Unicode emoji and the data dict from
        :data:`EMOJI_DATA` and must return a replacement string to be used.
        The passed data is in the form of::

            handle_version('\\U0001F6EB', {
                'en' : ':airplane_departure:',
                'status' : fully_qualified,
                'E' : 1,
                'alias' : [':flight_departure:'],
                'de': ':abflug:',
                'es': ':avión_despegando:',
                ...
            })

    """
    # 'alias' is not a language of its own: it means English data, with the
    # first alias preferred over the canonical English name.
    _use_aliases = language == 'alias'
    if _use_aliases:
        language = 'en'

    unicode_codes.load_from_json(language)

    def to_shortcode(found: EmojiMatch) -> str:
        data = found.data
        assert data is not None

        # Emoji newer than the requested version are removed or replaced
        if version is not None and data['E'] > version:
            if callable(handle_version):
                return handle_version(found.emoji, found.data_copy())
            if handle_version is not None:
                return handle_version
            return ''

        if language not in data:
            # The emoji exists, but it is not translated, so we keep the emoji
            return found.emoji

        if _use_aliases and 'alias' in data:
            shortcode = data['alias'][0]
        else:
            shortcode = data[language]
        # Stored names carry the default delimiters; swap them for the caller's
        return delimiters[0] + shortcode[1:-1] + delimiters[1]

    pieces = []
    for token in tokenize(string, keep_zwj=config.demojize_keep_zwj):
        value = token.value
        if isinstance(value, EmojiMatch):
            pieces.append(str(to_shortcode(value)))
        else:
            pieces.append(value)
    return ''.join(pieces)
def replace_emoji(
    string: str,
    replace: Union[str, Callable[[str, Dict[str, str]], str]] = '',
    version: float = -1,
) -> str:
    """
    Replace Unicode emoji in a customizable string.

    :param string: String contains Unicode characters. MUST BE UNICODE.
    :param replace: (optional) replace can be either a string or a callable;
        If it is a callable, it's passed the Unicode emoji and the data dict from
        :data:`EMOJI_DATA` and must return a replacement string to be used.
        replace(str, dict) -> str
    :param version: (optional) Max version. If set to an Emoji Version,
        only emoji above this version will be replaced.
    """

    def substitute(found: EmojiMatch) -> str:
        if version > -1:
            # Version filter active: only emoji newer than ``version`` change
            assert found.data is not None
            if found.data['E'] > version:
                if callable(replace):
                    return replace(found.emoji, found.data_copy())
                return str(replace)
            return found.emoji

        if callable(replace):
            return replace(found.emoji, found.data_copy())
        if replace is not None:  # type: ignore
            return replace
        return found.emoji

    keep_zwj = config.replace_emoji_keep_zwj
    tokens = tokenize(string, keep_zwj=keep_zwj)
    if keep_zwj:
        # Merge emoji separated only by a ZWJ so they are replaced as one unit
        tokens = filter_tokens(tokens, emoji_only=False, join_emoji=True)

    pieces = []
    for token in tokens:
        value = token.value
        if isinstance(value, EmojiMatch):
            pieces.append(str(substitute(value)))
        else:
            pieces.append(value)
    return ''.join(pieces)
def emoji_list(string: str) -> List[_EmojiListReturn]:
    """
    Returns the location and emoji in list of dict format.

        >>> emoji.emoji_list("Hi, I am fine. 😁")
        [{'match_start': 15, 'match_end': 16, 'emoji': '😁'}]

    :param string: The text to search for emoji
    """
    values = (token.value for token in tokenize(string, keep_zwj=False))
    # Non-emoji tokens carry plain characters as their value and are skipped
    return [
        {
            'match_start': found.start,
            'match_end': found.end,
            'emoji': found.emoji,
        }
        for found in values
        if isinstance(found, EmojiMatch)
    ]
def distinct_emoji_list(string: str) -> List[str]:
    """
    Returns distinct list of emojis from the string.

    Each emoji appears once, in the order of its first occurrence in
    ``string``, so the result is the same on every run.

    :param string: The text to search for emoji
    """
    # dict.fromkeys() de-duplicates while keeping insertion order; a set would
    # yield an arbitrary order that changes with string hash randomization.
    return list(dict.fromkeys(entry['emoji'] for entry in emoji_list(string)))
def emoji_count(string: str, unique: bool = False) -> int:
    """
    Returns the count of emojis in a string.

    :param string: The text to search for emoji
    :param unique: (optional) True if count only unique emojis
    """
    collect = distinct_emoji_list if unique else emoji_list
    return len(collect(string))
def is_emoji(string: str) -> bool:
    """
    Returns True if the string is a single emoji, and it is "recommended for
    general interchange" by Unicode.org.

    :param string: The candidate text; it must match a key of
        :data:`EMOJI_DATA` exactly.
    """
    known_emoji = unicode_codes.EMOJI_DATA
    return string in known_emoji
def purely_emoji(string: str) -> bool:
    """
    Returns True if the string contains only emojis.
    This might not imply that `is_emoji` for all the characters, for example,
    if the string contains variation selectors.

    :param string: The text to check. A string that yields no tokens
        results in True.
    """
    for token in analyze(string, non_emoji=True):
        if not isinstance(token.value, EmojiMatch):
            # Found a token that is not an emoji
            return False
    return True
def version(string: str) -> float:
    """
    Returns the Emoji Version of the emoji.

    See https://www.unicode.org/reports/tr51/#Versioning for more information.

        >>> emoji.version("😁")
        0.6
        >>> emoji.version(":butterfly:")
        3

    :param string: An emoji or a text containing an emoji
    :raises ValueError: if ``string`` does not contain an emoji
    """
    # Try dictionary lookup
    if string in unicode_codes.EMOJI_DATA:
        return unicode_codes.EMOJI_DATA[string]['E']

    # Try name lookup
    emj_code = unicode_codes.get_emoji_by_name(string, 'en')
    if emj_code and emj_code in unicode_codes.EMOJI_DATA:
        return unicode_codes.EMOJI_DATA[emj_code]['E']

    # Try to find first emoji in string
    collected: List[float] = []

    def record(_emj: str, emoji_data: Dict[str, Any]) -> str:
        # Used as a replacement callback purely for its side effect
        collected.append(emoji_data['E'])
        return ''

    # Unicode emoji in the text first, then English aliases, then each language
    replace_emoji(string, replace=record, version=-1)
    if not collected:
        emojize(string, language='alias', version=-1, handle_version=record)
    if not collected:
        for lang_code in unicode_codes.LANGUAGES:
            emojize(string, language=lang_code, version=-1, handle_version=record)
            if collected:
                break

    if not collected:
        raise ValueError('No emoji found in string')
    return collected[0]