# Source code for emoji.tokenizer

"""
emoji.tokenizer
~~~~~~~~~~~~~~~

Components for detecting and tokenizing emoji in strings.

"""
from typing import NamedTuple, Dict, Union, Iterator, Any
from emoji import unicode_codes


# Public names exported by ``from emoji.tokenizer import *``.
__all__ = [
    'EmojiMatch', 'EmojiMatchZWJ', 'EmojiMatchZWJNonRGI', 'Token',
    'tokenize', 'filter_tokens',
]

# The zero-width-joiner character used to glue several emoji into one sequence.
_ZWJ = '\u200D'
# Module-level cache for the search tree; stays None until get_search_tree()
# builds it on first use.
_SEARCH_TREE = None


class EmojiMatch:
    """
    A single "recommended for general interchange" (RGI) emoji that was found
    in a string, together with its position and its :data:`EMOJI_DATA` entry.
    """

    __slots__ = ('emoji', 'start', 'end', 'data')

    def __init__(self, emoji: str, start: int, end: int,
                 data: Union[dict, None]):
        #: The emoji substring
        self.emoji = emoji

        #: The start index of the match in the string
        self.start = start

        #: The end index of the match in the string
        self.end = end

        #: The entry from :data:`EMOJI_DATA` for this emoji or ``None`` if the
        #: emoji is non-RGI
        self.data = data

    def data_copy(self) -> Dict[str, Any]:
        """
        Build a fresh dict describing this match.

        :returns: A shallow copy of the :data:`EMOJI_DATA` entry extended with
            the keys ``match_start`` and ``match_end``; when there is no
            (or an empty) data entry, a dict holding only those two keys.
        """
        if not self.data:
            return {
                'match_start': self.start,
                'match_end': self.end
            }

        # Copy first so the shared EMOJI_DATA entry is never modified.
        enriched = self.data.copy()
        enriched['match_start'] = self.start
        enriched['match_end'] = self.end
        return enriched

    def is_zwj(self) -> bool:
        """
        Tell whether the matched emoji contains a zero-width-joiner.

        :returns: True if this is a ZWJ-emoji, False otherwise
        """
        return _ZWJ in self.emoji

    def split(self) -> Union['EmojiMatchZWJ', 'EmojiMatch']:
        """
        Break a ZWJ-emoji up into the emoji it is composed of.

        :returns: An :class:`EmojiMatchZWJ` containing the "sub-emoji" if this
            is a ZWJ-emoji, otherwise self
        """
        return EmojiMatchZWJ(self) if self.is_zwj() else self

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f'{class_name}({self.emoji}, {self.start}:{self.end})'
class EmojiMatchZWJ(EmojiMatch):
    """
    A match of several emoji that are glued together by zero-width-joiners
    (ZWJ/``\\u200D``). The individual emoji are available in :attr:`emojis`.
    """

    __slots__ = ('emojis', )

    def __init__(self, match: EmojiMatch):
        super().__init__(match.emoji, match.start, match.end, match.data)

        #: List of sub emoji as EmojiMatch objects
        self.emojis = []

        # Walk over the ZWJ-separated parts while tracking where each part
        # starts in the original string.
        offset = match.start
        for part in match.emoji.split(_ZWJ):
            part_end = offset + len(part)
            sub_match = EmojiMatch(
                part, offset, part_end, unicode_codes.EMOJI_DATA.get(part))
            self.emojis.append(sub_match)
            # The next part begins right after the joiner (one character).
            offset = part_end + 1

    def join(self) -> str:
        """
        Glue the sub emoji back together with ZWJ characters.

        :returns: The sub emoji joined into a single string
        """
        return _ZWJ.join([sub.emoji for sub in self.emojis])

    def is_zwj(self) -> bool:
        """Always True: instances of this class are ZWJ-emoji by definition."""
        return True

    def split(self) -> 'EmojiMatchZWJ':
        """Return self, the match is already split into its sub emoji."""
        return self

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f'{class_name}({self.join()}, {self.start}:{self.end})'
class EmojiMatchZWJNonRGI(EmojiMatchZWJ):
    """
    A match of several emoji that are glued together by zero-width-joiners
    (ZWJ/``\\u200D``), used only for sequences that are not "recommended for
    general interchange" (non-RGI) by Unicode.org.
    The data property of this class is always None.
    """

    def __init__(self, first_emoji_match: EmojiMatch,
                 second_emoji_match: EmojiMatch):
        #: List of sub emoji as EmojiMatch objects
        self.emojis = [first_emoji_match, second_emoji_match]
        self._update()

    def _update(self):
        # Recompute every derived attribute from the current sub emoji list.
        parts = self.emojis
        self.emoji = _ZWJ.join([part.emoji for part in parts])
        self.start = parts[0].start
        self.end = parts[-1].end
        self.data = None

    def _add(self, next_emoji_match: EmojiMatch):
        # Extend the sequence by one more emoji and refresh the derived fields.
        self.emojis.append(next_emoji_match)
        self._update()
class Token(NamedTuple):
    """
    A named tuple describing one piece of a tokenized string.

    For an emoji, ``chars`` is the matched emoji substring and ``value`` is
    its :class:`EmojiMatch` object. For a single character that is not a
    unicode emoji, both ``chars`` and ``value`` are that character.
    """
    # The matched substring (an emoji sequence or a single character).
    chars: str
    # The EmojiMatch for an emoji, or the character itself otherwise.
    value: Union[str, EmojiMatch]
def tokenize(string: str, keep_zwj: bool) -> Iterator[Token]:
    """
    Finds unicode emoji in a string. Yields all normal characters as a named
    tuple :class:`Token` ``(char, char)`` and all emoji as :class:`Token`
    ``(chars, EmojiMatch)``.

    Variation selectors (``\\uFE0E`` / ``\\uFE0F``) that are not part of a
    matched emoji are dropped and not yielded.

    :param string: String contains unicode characters. MUST BE UNICODE.
    :param keep_zwj: Should ZWJ-characters (``\\u200D``) that join non-RGI emoji be
        skipped or should be yielded as normal characters
    :return: An iterable of tuples :class:`Token` ``(char, char)`` or
        :class:`Token` ``(chars, EmojiMatch)``
    """

    tree = get_search_tree()
    EMOJI_DATA = unicode_codes.EMOJI_DATA
    # result: [ Token(oldsubstring0, EmojiMatch), Token(char1, char1), ... ]
    # Tokens are buffered here and only yielded once it is certain that they
    # are not part of an ongoing ZWJ-sequence.
    result = []
    i = 0
    length = len(string)
    # Indices of chars in string that are skipped, i.e. the ZWJ-char in
    # non-RGI-ZWJ-sequences. A set keeps the per-character membership tests
    # O(1); a list made them linear in the number of ignored ZWJs.
    ignore = set()
    while i < length:
        consumed = False
        char = string[i]
        if i in ignore:
            # A ZWJ that was already identified as joining non-RGI emoji
            i += 1
            if char == _ZWJ and keep_zwj:
                result.append(Token(char, char))
            continue

        elif char in tree:
            # Follow the search tree as far as the string matches
            j = i + 1
            sub_tree = tree[char]
            while j < length and string[j] in sub_tree:
                if j in ignore:
                    # Never match across a ZWJ that joins non-RGI emoji
                    break
                sub_tree = sub_tree[string[j]]
                j += 1
            if 'data' in sub_tree:
                emj_data = sub_tree['data']
                code_points = string[i:j]

                # We cannot yield the result here, we need to defer
                # the call until we are sure that the emoji is finished
                # i.e. we're not inside an ongoing ZWJ-sequence

                match_obj = EmojiMatch(code_points, i, j, emj_data)

                # i is advanced by one at the end of the loop body
                i = j - 1
                consumed = True
                result.append(Token(code_points, match_obj))

        elif char == _ZWJ and result and result[-1].chars in EMOJI_DATA and i > 0 and string[i - 1] in tree:
            # the current char is ZWJ and the last match was an emoji

            ignore.add(i)
            if EMOJI_DATA[result[-1].chars]["status"] == unicode_codes.STATUS["component"]:
                # last match was a component, it could be ZWJ+EMOJI+COMPONENT
                # or ZWJ+COMPONENT
                i = i - sum(len(t.chars) for t in result[-2:])
                if string[i] == _ZWJ:
                    # It's ZWJ+COMPONENT, move one back
                    i += 1
                    del result[-1]
                else:
                    # It's ZWJ+EMOJI+COMPONENT, move two back
                    del result[-2:]
            else:
                # last match result[-1] was a normal emoji, move cursor
                # before the emoji
                i = i - len(result[-1].chars)
                del result[-1]
            # Re-scan from the new cursor position, now with the ZWJ ignored
            continue

        elif result:
            # An ordinary character ends any pending sequence: flush the buffer
            yield from result
            result = []

        if not consumed and char != '\uFE0E' and char != '\uFE0F':
            result.append(Token(char, char))
        i += 1

    yield from result


def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool) -> Iterator[Token]:
    """
    Filters the output of `tokenize()`

    :param matches: An iterable of tuples of the form ``(match_str, result)``
        where ``result`` is either an EmojiMatch or a string.
    :param emoji_only: If True, only EmojiMatch are returned in the output.
        If False all characters are returned
    :param join_emoji: If True, multiple EmojiMatch are merged into
        a single :class:`EmojiMatchZWJNonRGI` if they are separated only by a ZWJ.

    :return: An iterable of tuples :class:`Token` ``(char, char)``,
        :class:`Token` ``(chars, EmojiMatch)`` or :class:`Token` ``(chars, EmojiMatchZWJNonRGI)``
    """

    if not join_emoji and not emoji_only:
        # Nothing to filter and nothing to join: pass everything through
        yield from matches
        return

    if not join_emoji:
        # NOTE(review): this branch is reached with emoji_only=True but only
        # drops ZWJ tokens; plain characters are still yielded, which does not
        # match the description of ``emoji_only`` above. Behavior is kept
        # unchanged here - confirm whether callers rely on it.
        for token in matches:
            if token.chars != _ZWJ:
                yield token
        return

    # Combine multiple EmojiMatch that are separated by ZWJs into
    # a single EmojiMatchZWJNonRGI
    previous_is_emoji = False
    previous_is_zwj = False
    pre_previous_is_emoji = False
    # Emoji tokens that may still be extended by a following ZWJ + emoji
    accumulator = []
    for token in matches:
        pre_previous_is_emoji = previous_is_emoji
        if previous_is_emoji and token.value == _ZWJ:
            # A ZWJ directly after an emoji is only remembered; it is never
            # yielded as a token of its own.
            previous_is_zwj = True
        elif isinstance(token.value, EmojiMatch):
            if pre_previous_is_emoji and previous_is_zwj:
                if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
                    # Extend the non-RGI sequence that is already being built
                    accumulator[-1].value._add(token.value)
                    accumulator[-1] = Token(accumulator[-1].chars +
                                            _ZWJ + token.chars, accumulator[-1].value)
                else:
                    # Start a new non-RGI sequence from the previous emoji
                    # and the current one
                    prev = accumulator.pop()
                    accumulator.append(
                        Token(prev.chars + _ZWJ + token.chars,
                              EmojiMatchZWJNonRGI(
                                  prev.value,
                                  token.value)))
            else:
                accumulator.append(token)
            previous_is_emoji = True
            previous_is_zwj = False
        else:
            # Other character, not an emoji
            previous_is_emoji = False
            previous_is_zwj = False
            yield from accumulator
            if not emoji_only:
                yield token
            accumulator = []
    yield from accumulator


def get_search_tree() -> Dict[str, Any]:
    """
    Generate a search tree for demojize().

    The tree is built from :data:`EMOJI_DATA` on the first call and cached in
    the module-level ``_SEARCH_TREE``; later calls return the cached tree.

    Example of a search tree::

        EMOJI_DATA =
            {'a': {'en': ':Apple:'},
            'b': {'en': ':Bus:'},
            'ba': {'en': ':Bat:'},
            'band': {'en': ':Beatles:'},
            'bandit': {'en': ':Outlaw:'},
            'bank': {'en': ':BankOfEngland:'},
            'bb': {'en': ':BB-gun:'},
            'c': {'en': ':Car:'}}

        _SEARCH_TREE =
            {'a': {'data': {'en': ':Apple:'}},
            'b': {'a': {'data': {'en': ':Bat:'},
                        'n': {'d': {'data': {'en': ':Beatles:'},
                                    'i': {'t': {'data': {'en': ':Outlaw:'}}}},
                            'k': {'data': {'en': ':BankOfEngland:'}}}},
                'b': {'data': {'en': ':BB-gun:'}},
                'data': {'en': ':Bus:'}},
            'c': {'data': {'en': ':Car:'}}}

                       _SEARCH_TREE
                     /     |        ⧵
                   /       |          ⧵
                a          b             c
                |        / |  ⧵          |
                |       /  |    ⧵        |
            :Apple:   ba  :Bus:  bb     :Car:
                     /  ⧵         |
                    /    ⧵        |
                  :Bat:    ban     :BB-gun:
                         /     ⧵
                        /       ⧵
                     band       bank
                    /   ⧵         |
                   /     ⧵        |
                bandi :Beatles:  :BankOfEngland:
                   |
                bandit
                   |
               :Outlaw:

    :return: The (cached) search tree as nested dicts keyed by single
        characters, with the emoji's data stored under the key ``'data'``
    """
    global _SEARCH_TREE
    if _SEARCH_TREE is None:
        # Build into a local dict and publish it only when it is complete, so
        # that neither a concurrent caller nor an interrupted build can leave
        # a partially filled tree in the module-level cache.
        tree = {}
        for emj, emj_data in unicode_codes.EMOJI_DATA.items():
            sub_tree = tree
            for char in emj:
                sub_tree = sub_tree.setdefault(char, {})
            if emj:
                # The node reached by the last character carries the data
                sub_tree['data'] = emj_data
        _SEARCH_TREE = tree
    return _SEARCH_TREE