update_emoji.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
"""
Update Emoji.py
Refreshes OMZ emoji database based on the latest Unicode spec
"""
  5. import re
  6. import json
  7. spec = open("emoji-data.txt", "r")
  8. # Regexes
  9. # regex_emoji will return, respectively:
  10. # the code points, its type (status), the actual emoji, and its official name
  11. regex_emoji = r"^([\w ].*?\S)\s*;\s*([\w-]+)\s*#\s*(.*?)\s(\S.*).*$"
  12. # regex_group returns the group of subgroup that a line opens
  13. regex_group = r"^#\s*(group|subgroup):\s*(.*)$"
  14. headers = """
  15. # emoji-char-definitions.zsh - Emoji definitions for oh-my-zsh emoji plugin
  16. #
  17. # This file is auto-generated by update_emoji.py. Do not edit it manually.
  18. #
  19. # This contains the definition for:
  20. # $emoji - which maps character names to Unicode characters
  21. # $emoji_flags - maps country names to Unicode flag characters using region
  22. # indicators
  23. # $emoji_mod - maps modifier components to Unicode characters
  24. # $emoji_groups - a single associative array to avoid cluttering up the
  25. # global namespace, and to allow adding additional group
  26. # definitions at run time. The keys are the group names, and
  27. # the values are whitespace-separated lists of emoji
  28. # character names.
  29. # Main emoji
  30. typeset -gAH emoji
  31. # National flags
  32. typeset -gAH emoji_flags
  33. # Combining modifiers
  34. typeset -gAH emoji_mod
  35. # Emoji groups
  36. typeset -gAH emoji_groups
  37. """
#######
# Adding country codes
#######

# This is the only part of this script that relies on an external library
# (country_converter), and is hence commented out by default.
# You can uncomment it to have country codes added as aliases for flag
# emojis. (By default, when you install this extension, country codes are
# included as aliases, but not if you re-run this script without uncommenting.)
# Warning: country_converter is very verbose, and will print warnings all over
# your terminal.

# import country_converter as coco # pylint: disable=wrong-import-position
# cc = coco.CountryConverter()

# def country_iso(_all_names, _omz_name):
#     """ Using the external library country_converter,
#         this function can detect the ISO2 and ISO3 codes
#         of the country. It takes as argument the array
#         with all the names of the emoji, and returns that array."""
#     omz_no_underscore = re.sub(r'_', r' ', _omz_name)
#     iso2 = cc.convert(names=[omz_no_underscore], to='ISO2')
#     if iso2 != 'not found':
#         _all_names.append(iso2)
#         iso3 = cc.convert(names=[omz_no_underscore], to='ISO3')
#         _all_names.append(iso3)
#     return _all_names
#######
# Helper functions
#######
  65. def code_to_omz(_code_points):
  66. """ Returns a ZSH-compatible Unicode string from the code point(s) """
  67. return r'\U' + r'\U'.join(_code_points.split(' '))
  68. def name_to_omz(_name, _group, _subgroup, _status):
  69. """ Returns a reasonable snake_case name for the emoji. """
  70. def snake_case(_string):
  71. """ Does the regex work of snake_case """
  72. remove_dots = re.sub(r'\.\(\)', r'', _string)
  73. replace_ands = re.sub(r'\&', r'and', remove_dots)
  74. remove_whitespace = re.sub(r'[^\#\*\w]', r'_', replace_ands)
  75. return re.sub(r'__', r'_', remove_whitespace)
  76. shortname = ""
  77. split_at_colon = lambda s: s.split(": ")
  78. # Special treatment by group and subgroup
  79. # If the emoji is a flag, we strip "flag" from its name
  80. if _group == "Flags" and len(split_at_colon(_name)) > 1:
  81. shortname = snake_case(split_at_colon(_name)[1])
  82. else:
  83. shortname = snake_case(_name)
  84. # Special treatment by status
  85. # Enables us to have every emoji combination,
  86. # even the one that are not officially sanctioned
  87. # and are implemented by, say, only one vendor
  88. if _status == "unqualified":
  89. shortname += "_unqualified"
  90. elif _status == "minimally-qualified":
  91. shortname += "_minimally"
  92. return shortname
  93. def increment_name(_shortname):
  94. """ Increment the short name by 1. If you get, say,
  95. 'woman_detective_unqualified', it returns
  96. 'woman_detective_unqualified_1', and then
  97. 'woman_detective_unqualified_2', etc. """
  98. last_char = _shortname[-1]
  99. if last_char.isdigit():
  100. num = int(last_char)
  101. return _shortname[:-1] + str(num + 1)
  102. return _shortname + "_1"
  103. ########
  104. # Going through every line
  105. ########
  106. group, subgroup, short_name_buffer = "", "", ""
  107. emoji_database = []
  108. for line in spec:
  109. # First, test if this line opens a group or subgroup
  110. group_match = re.findall(regex_group, line)
  111. if group_match != []:
  112. gr_or_sub, name = group_match[0]
  113. if gr_or_sub == "group":
  114. group = name
  115. elif gr_or_sub == "subgroup":
  116. subgroup = name
  117. continue # Moving on...
  118. # Second, test if this line references one emoji
  119. emoji_match = re.findall(regex_emoji, line)
  120. if emoji_match != []:
  121. code_points, status, emoji, name = emoji_match[0]
  122. omz_codes = code_to_omz(code_points)
  123. omz_name = name_to_omz(name, group, subgroup, status)
  124. # If this emoji has the same shortname as the preceding one
  125. if omz_name in short_name_buffer:
  126. omz_name = increment_name(short_name_buffer)
  127. short_name_buffer = omz_name
  128. emoji_database.append(
  129. [omz_codes, status, emoji, omz_name, group, subgroup])
  130. spec.close()
  131. ########
  132. # Write to emoji-char-definitions.zsh
  133. ########
  134. # Aliases for emojis are retrieved through the DB of Gemoji
  135. # Retrieved on Aug 9 2019 from the following URL:
  136. # https://raw.githubusercontent.com/github/gemoji/master/db/emoji.json
  137. gemoji_db = open("gemoji_db.json")
  138. j = json.load(gemoji_db)
  139. aliases_map = {entry['emoji']: entry['aliases'] for entry in j}
  140. all_omz_names = [emoji_data[3] for emoji_data in emoji_database]
  141. # Let's begin writing to this file
  142. output = open("emoji-char-definitions.zsh", "w")
  143. output.write(headers)
  144. emoji_groups = {"fruits": "\n", "vehicles": "\n", "hands": "\n",
  145. "people": "\n", "animals": "\n", "faces": "\n",
  146. "flags": "\n"}
  147. # First, write every emoji down
  148. for _omz_codes, _status, _emoji, _omz_name, _group, _subgroup in emoji_database:
  149. # One emoji can be mapped to multiple names (aliases or country codes)
  150. names_for_this_emoji = [_omz_name]
  151. # Variable that indicates in which map the emoji will be located
  152. emoji_map = "emoji"
  153. if _status == "component":
  154. emoji_map = "emoji_mod"
  155. if _group == "Flags":
  156. emoji_map = "emoji_flags"
  157. # Adding country codes (Optional, see above)
  158. # names_for_this_emoji = country_iso(names_for_this_emoji, _omz_name)
  159. # Check if there is an alias available in the Gemoji DB
  160. if _emoji in aliases_map.keys():
  161. for alias in aliases_map[_emoji]:
  162. if alias not in all_omz_names:
  163. names_for_this_emoji.append(alias)
  164. # And now we write to the definitions file
  165. for one_name in names_for_this_emoji:
  166. output.write(f"{emoji_map}[{one_name}]=$'{_omz_codes}'\n")
  167. # Storing the emoji in defined subgroups for the next step
  168. if _status == "fully-qualified":
  169. if _subgroup == "food-fruit":
  170. emoji_groups["fruits"] += f" {_omz_name}\n"
  171. elif "transport-" in _subgroup:
  172. emoji_groups["vehicles"] += f" {_omz_name}\n"
  173. elif "hand-" in _subgroup:
  174. emoji_groups["hands"] += f" {_omz_name}\n"
  175. elif "person-" in _subgroup or _subgroup == "family":
  176. emoji_groups["people"] += f" {_omz_name}\n"
  177. elif "animal-" in _subgroup:
  178. emoji_groups["animals"] += f" {_omz_name}\n"
  179. elif "face-" in _subgroup:
  180. emoji_groups["faces"] += f" {_omz_name}\n"
  181. elif _group == "Flags":
  182. emoji_groups["flags"] += f" {_omz_name}\n"
  183. # Second, write the subgroups to the end of the file
  184. for name, string in emoji_groups.items():
  185. output.write(f'\nemoji_groups[{name}]="{string}"\n')
  186. output.close()