Source code for emoji.core

"""
emoji.core
~~~~~~~~~~

Core components for emoji.

"""

import re
import unicodedata
from typing import Iterator

from emoji import unicode_codes
from emoji.tokenizer import Token, EmojiMatch, EmojiMatchZWJ, EmojiMatchZWJNonRGI, tokenize, filter_tokens

__all__ = [
    'emojize', 'demojize', 'analyze', 'config',
    'emoji_list', 'distinct_emoji_list', 'emoji_count',
    'replace_emoji', 'is_emoji', 'purely_emoji', 'version',
    'Token', 'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI',
]

_DEFAULT_DELIMITER = ':'
# In Arabic language, the unicode character "\u0655" should be kept so we add it to the pattern below
_EMOJI_NAME_PATTERN = '\\w\\-&.’”“()!#*+,/«»\u0300\u0301\u0302\u0303\u0306\u0308\u030a\u0327\u064b\u064e\u064f\u0650\u0653\u0654\u3099\u30fb\u309a\u0655'


[docs] class config(): """Module-wide configuration""" demojize_keep_zwj = True """Change the behavior of :func:`emoji.demojize()` regarding zero-width-joiners (ZWJ/``\\u200D``) in emoji that are not "recommended for general interchange" (non-RGI). It has no effect on RGI emoji. For example this family emoji with different skin tones "👨‍👩🏿‍👧🏻‍👦🏾" contains four person emoji that are joined together by three ZWJ characters: ``👨\\u200D👩🏿\\u200D👧🏻\\u200D👦🏾`` If ``True``, the zero-width-joiners will be kept and :func:`emoji.emojize()` can reverse the :func:`emoji.demojize()` operation: ``emoji.emojize(emoji.demojize(s)) == s`` The example emoji would be converted to ``:man:\\u200d:woman_dark_skin_tone:\\u200d:girl_light_skin_tone:\\u200d:boy_medium-dark_skin_tone:`` If ``False``, the zero-width-joiners will be removed and :func:`emoji.emojize()` can only reverse the individual emoji: ``emoji.emojize(emoji.demojize(s)) != s`` The example emoji would be converted to ``:man::woman_dark_skin_tone::girl_light_skin_tone::boy_medium-dark_skin_tone:`` """ replace_emoji_keep_zwj = False """Change the behavior of :func:`emoji.replace_emoji()` regarding zero-width-joiners (ZWJ/``\\u200D``) in emoji that are not "recommended for general interchange" (non-RGI). It has no effect on RGI emoji. See :attr:`config.demojize_keep_zwj` for more information. """
[docs] def emojize( string, delimiters=(_DEFAULT_DELIMITER, _DEFAULT_DELIMITER), variant=None, language='en', version=None, handle_version=None ): """ Replace emoji names in a string with Unicode codes. >>> import emoji >>> print(emoji.emojize("Python is fun :thumbsup:", language='alias')) Python is fun 👍 >>> print(emoji.emojize("Python is fun :thumbs_up:")) Python is fun 👍 >>> print(emoji.emojize("Python is fun {thumbs_up}", delimiters = ("{", "}"))) Python is fun 👍 >>> print(emoji.emojize("Python is fun :red_heart:", variant="text_type")) Python is fun ❤ >>> print(emoji.emojize("Python is fun :red_heart:", variant="emoji_type")) Python is fun ❤️ # red heart, not black heart :param string: String contains emoji names. :param delimiters: (optional) Use delimiters other than _DEFAULT_DELIMITER. Each delimiter should contain at least one character that is not part of a-zA-Z0-9 and ``_-&.()!?#*+,``. See ``emoji.core._EMOJI_NAME_PATTERN`` for the regular expression of unsafe characters. :param variant: (optional) Choose variation selector between "base"(None), VS-15 ("text_type") and VS-16 ("emoji_type") :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias' to use English aliases :param version: (optional) Max version. If set to an Emoji Version, all emoji above this version will be ignored. :param handle_version: (optional) Replace the emoji above ``version`` instead of ignoring it. handle_version can be either a string or a callable; If it is a callable, it's passed the Unicode emoji and the data dict from :data:`EMOJI_DATA` and must return a replacement string to be used:: handle_version('\\U0001F6EB', { 'en' : ':airplane_departure:', 'status' : fully_qualified, 'E' : 1, 'alias' : [':flight_departure:'], 'de': ':abflug:', 'es': ':avión_despegando:', ... }) :raises ValueError: if ``variant`` is neither None, 'text_type' or 'emoji_type' """ if language == 'alias': language_pack = unicode_codes.get_aliases_unicode_dict() else: language_pack = unicode_codes.get_emoji_unicode_dict(language) pattern = re.compile('(%s[%s]+%s)' % (re.escape(delimiters[0]), _EMOJI_NAME_PATTERN, re.escape(delimiters[1]))) def replace(match): name = match.group(1)[len(delimiters[0]):-len(delimiters[1])] emj = language_pack.get( _DEFAULT_DELIMITER + unicodedata.normalize('NFKC', name) + _DEFAULT_DELIMITER) if emj is None: return match.group(1) if version is not None and unicode_codes.EMOJI_DATA[emj]['E'] > version: if callable(handle_version): emj_data = unicode_codes.EMOJI_DATA[emj].copy() emj_data['match_start'] = match.start() emj_data['match_end'] = match.end() return handle_version(emj, emj_data) elif handle_version is not None: return str(handle_version) else: return '' if variant is None or 'variant' not in unicode_codes.EMOJI_DATA[emj]: return emj if emj[-1] == '\uFE0E' or emj[-1] == '\uFE0F': # Remove an existing variant emj = emj[0:-1] if variant == "text_type": return emj + '\uFE0E' elif variant == "emoji_type": return emj + '\uFE0F' else: raise ValueError( "Parameter 'variant' must be either None, 'text_type' or 'emoji_type'") return pattern.sub(replace, string)
[docs] def analyze(string: str, non_emoji: bool = False, join_emoji: bool = True) -> Iterator[Token]: """ Find unicode emoji in a string. Yield each emoji as a named tuple :class:`Token` ``(chars, EmojiMatch)`` or `:class:`Token` ``(chars, EmojiMatchZWJNonRGI)``. If ``non_emoji`` is True, also yield all other characters as :class:`Token` ``(char, char)`` . :param string: String to analyze :param non_emoji: If True also yield all non-emoji characters as Token(char, char) :param join_emoji: If True, multiple EmojiMatch are merged into a single EmojiMatchZWJNonRGI if they are separated only by a ZWJ. """ return filter_tokens( tokenize(string, keep_zwj=True), emoji_only=not non_emoji, join_emoji=join_emoji)
[docs] def demojize( string, delimiters=(_DEFAULT_DELIMITER, _DEFAULT_DELIMITER), language='en', version=None, handle_version=None ): """ Replace Unicode emoji in a string with emoji shortcodes. Useful for storage. >>> import emoji >>> print(emoji.emojize("Python is fun :thumbs_up:")) Python is fun 👍 >>> print(emoji.demojize("Python is fun 👍")) Python is fun :thumbs_up: >>> print(emoji.demojize("icode is tricky 😯", delimiters=("__", "__"))) Unicode is tricky __hushed_face__ :param string: String contains Unicode characters. MUST BE UNICODE. :param delimiters: (optional) User delimiters other than ``_DEFAULT_DELIMITER`` :param language: Choose language of emoji name: language code 'es', 'de', etc. or 'alias' to use English aliases :param version: (optional) Max version. If set to an Emoji Version, all emoji above this version will be removed. :param handle_version: (optional) Replace the emoji above ``version`` instead of removing it. handle_version can be either a string or a callable ``handle_version(emj: str, data: dict) -> str``; If it is a callable, it's passed the Unicode emoji and the data dict from :data:`EMOJI_DATA` and must return a replacement string to be used. The passed data is in the form of:: handle_version('\\U0001F6EB', { 'en' : ':airplane_departure:', 'status' : fully_qualified, 'E' : 1, 'alias' : [':flight_departure:'], 'de': ':abflug:', 'es': ':avión_despegando:', ... }) """ if language == 'alias': language = 'en' _use_aliases = True else: _use_aliases = False def handle(emoji_match): if version is not None and emoji_match.data['E'] > version: if callable(handle_version): return handle_version(emoji_match.emoji, emoji_match.data_copy()) elif handle_version is not None: return handle_version else: return '' elif language in emoji_match.data: if _use_aliases and 'alias' in emoji_match.data: return delimiters[0] + emoji_match.data['alias'][0][1:-1] + delimiters[1] else: return delimiters[0] + emoji_match.data[language][1:-1] + delimiters[1] else: # The emoji exists, but it is not translated, so we keep the emoji return emoji_match.emoji matches = tokenize(string, keep_zwj=config.demojize_keep_zwj) return "".join(str(handle(token.value)) if isinstance( token.value, EmojiMatch) else token.value for token in matches)
[docs] def replace_emoji(string, replace='', version=-1): """ Replace Unicode emoji in a customizable string. :param string: String contains Unicode characters. MUST BE UNICODE. :param replace: (optional) replace can be either a string or a callable; If it is a callable, it's passed the Unicode emoji and the data dict from :data:`EMOJI_DATA` and must return a replacement string to be used. replace(str, dict) -> str :param version: (optional) Max version. If set to an Emoji Version, only emoji above this version will be replaced. """ def handle(emoji_match): if version > -1: if emoji_match.data['E'] > version: if callable(replace): return replace(emoji_match.emoji, emoji_match.data_copy()) else: return str(replace) elif callable(replace): return replace(emoji_match.emoji, emoji_match.data_copy()) elif replace is not None: return replace return emoji_match.emoji matches = tokenize(string, keep_zwj=config.replace_emoji_keep_zwj) if config.replace_emoji_keep_zwj: matches = filter_tokens( matches, emoji_only=False, join_emoji=True) return "".join(str(handle(m.value)) if isinstance( m.value, EmojiMatch) else m.value for m in matches)
[docs] def emoji_list(string): """ Returns the location and emoji in list of dict format. >>> emoji.emoji_list("Hi, I am fine. 😁") [{'match_start': 15, 'match_end': 16, 'emoji': '😁'}] """ return [{ 'match_start': m.value.start, 'match_end': m.value.end, 'emoji': m.value.emoji, } for m in tokenize(string, keep_zwj=False) if isinstance(m.value, EmojiMatch)]
[docs] def distinct_emoji_list(string): """Returns distinct list of emojis from the string.""" distinct_list = list( {e['emoji'] for e in emoji_list(string)} ) return distinct_list
[docs] def emoji_count(string, unique=False): """ Returns the count of emojis in a string. :param unique: (optional) True if count only unique emojis """ if unique: return len(distinct_emoji_list(string)) return len(emoji_list(string))
[docs] def is_emoji(string): """ Returns True if the string is a single emoji, and it is "recommended for general interchange" by Unicode.org. """ return string in unicode_codes.EMOJI_DATA
[docs] def purely_emoji(string: str) -> bool: """ Returns True if the string contains only emojis. This might not imply that `is_emoji` for all the characters, for example, if the string contains variation selectors. """ return all(isinstance(m.value, EmojiMatch) for m in analyze(string, non_emoji=True))
[docs] def version(string): """ Returns the Emoji Version of the emoji. See https://www.unicode.org/reports/tr51/#Versioning for more information. >>> emoji.version("😁") 0.6 >>> emoji.version(":butterfly:") 3 :param string: An emoji or a text containing an emoji :raises ValueError: if ``string`` does not contain an emoji """ # Try dictionary lookup if string in unicode_codes.EMOJI_DATA: return unicode_codes.EMOJI_DATA[string]['E'] language_pack = unicode_codes.get_emoji_unicode_dict('en') if string in language_pack: emj_code = language_pack[string] if emj_code in unicode_codes.EMOJI_DATA: return unicode_codes.EMOJI_DATA[emj_code]['E'] # Try to find first emoji in string version = [] def f(e, emoji_data): version.append(emoji_data['E']) return '' replace_emoji(string, replace=f, version=-1) if version: return version[0] emojize(string, language='alias', version=-1, handle_version=f) if version: return version[0] for lang_code in unicode_codes._EMOJI_UNICODE: emojize(string, language=lang_code, version=-1, handle_version=f) if version: return version[0] raise ValueError("No emoji found in string")