Skip to content

Commit

Permalink
refactor and add Emoji 15.1
Browse files Browse the repository at this point in the history
refactors detect function to not use regex!

closes #26
closes #25
  • Loading branch information
aaronpk committed Feb 19, 2024
1 parent 253048c commit 1fd7d46
Show file tree
Hide file tree
Showing 7 changed files with 396 additions and 58 deletions.
22 changes: 18 additions & 4 deletions build/build.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// https://github.com/iamcal/emoji-data
// https://raw.githubusercontent.com/iamcal/emoji-data/master/emoji_pretty.json

// Download the emoji dataset exactly once. Previously the file was fetched twice
// (a commit-pinned URL, then master) and the first result was thrown away.
// Fail fast on a download or decode error so the generated data files are never
// rebuilt from an empty dataset.
$emoji_json = file_get_contents('https://raw.githubusercontent.com/iamcal/emoji-data/master/emoji_pretty.json');
if($emoji_json === false) {
  throw new RuntimeException('Unable to download emoji_pretty.json');
}

$emoji_data = json_decode($emoji_json, true);
if(!is_array($emoji_data)) {
  throw new RuntimeException('emoji_pretty.json did not decode to an array: '.json_last_error_msg());
}

$map = [];

Expand Down Expand Up @@ -56,9 +56,23 @@
// Order $keys from longest to shortest by byte length: the comparator returns
// strlen($b) - strlen($a), which is negative when $a is the longer string.
usort($keys,function($a,$b){
return strlen($b)-strlen($a);
});
$all = preg_replace('/\-?([0-9a-f]+)/i', '\x{$1}', implode('|', $keys));

file_put_contents(dirname(__FILE__).'/../src/regexp.json', json_encode($all));
echo "Found ".count($keys)." emoji\n";

// Turn every key (hex codepoints joined by '-') into the character string it
// encodes, and separately collect the keys that consist of a single codepoint.
// NOTE(review): $codepoints is filled here but is not read anywhere in the
// visible part of this script - confirm whether it is still needed.
$codepoints = [];
$baseCodepoints = [];
foreach($keys as $key) {
$str = "";
$parts = explode('-', $key);
// Append the character for each hex codepoint in the key.
foreach($parts as $part) {
$str .= mb_chr(hexdec($part));
}
$codepoints[] = $str;

// A key with exactly one part is recorded as a base codepoint.
if(count($parts) == 1) {
$baseCodepoints[] = mb_chr(hexdec($parts[0]));
}
}
// Write the single-codepoint list as JSON to src/base-codepoints.json.
file_put_contents(__DIR__.'/../src/base-codepoints.json', json_encode($baseCodepoints));

echo "Found ".count($keys)." emoji\n";

163 changes: 116 additions & 47 deletions src/Emoji.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,59 +15,102 @@ function detect_emoji($string) {
if(!isset($map))
$map = _load_map();

static $regexp;
if(!isset($regexp))
$regexp = _load_regexp();
static $baseCodepoints;
if(!isset($baseCodepoints))
$baseCodepoints = _load_basecodepoints();

$codepoints = mb_str_split($string);
$emojiChars = [];

$currentEmoji = null;
$includeNext = false;
foreach($codepoints as $cp) {
if($currentEmoji == null) {
if(in_array($cp, $baseCodepoints)) {
$currentEmoji = $cp;
} elseif(_is_country_flag($cp)) {
$currentEmoji = $cp;
$includeNext = true; // Flags are always 2 chars so grab the next one
}
} else {
if($includeNext) {
$currentEmoji .= $cp;
$includeNext = false;
}
elseif(_is_modifier($cp)) {
// If this codepoint is a modifier, add it now
$currentEmoji .= $cp;
$includeNext = false;
} elseif(_is_zwj($cp)) {
// If this codepoint is a ZWJ, include the next codepoint in the emoji as well
$currentEmoji .= $cp;
$includeNext = true;
} else {
$emojiChars[] = $currentEmoji;
$currentEmoji = null;

if(in_array($cp, $baseCodepoints)) {
$currentEmoji = $cp;
} elseif(_is_country_flag($cp)) {
$currentEmoji = $cp;
$includeNext = true; // Flags are always 2 chars so grab the next one
}
}
}
}
if($currentEmoji) {
$emojiChars[] = $currentEmoji;
}

if(preg_match_all($regexp, $string, $matches, PREG_OFFSET_CAPTURE)) {
$lastGOffset = 0;
foreach($matches[0] as $match) {
$ch = $match[0]; // the actual emoji char found by the regex, may be multiple bytes
$mbLength = mb_strlen($ch); // the length of the emoji, mb chars are counted as 1
// Now we have a list of individual completed emoji chars in the order they are in the string.

$offset = $match[1];
$lastGOffset = 0;
$lastOffset = 0;

// echo mb_strlen($string)." found emoji length: ".strlen($ch)." lastGOffset: $lastGOffset mbLength: $mbLength\n";
foreach($emojiChars as $emoji) {
$mbLength = mb_strlen($emoji); // the length of the emoji, mb chars are counted as 1

$gOffset = grapheme_strpos($string, $ch, $lastGOffset);
$lastGOffset = $gOffset+1;
$offset = strpos($string, $emoji, $lastOffset);
$lastOffset = $offset + strlen($emoji);

$points = array();
for($i=0; $i<$mbLength; $i++) {
$points[] = strtoupper(dechex(uniord(mb_substr($ch, $i, 1))));
}
$hexstr = implode('-', $points);
$gOffset = grapheme_strpos($string, $emoji, $lastGOffset);
$lastGOffset = $gOffset + 1;

if(array_key_exists($hexstr, $map)) {
$short_name = $map[$hexstr];
} else {
$short_name = null;
}
$points = array();
for($i=0; $i<$mbLength; $i++) {
$points[] = strtoupper(dechex(uniord(mb_substr($emoji, $i, 1))));
}
$hexstr = implode('-', $points);

$skin_tone = null;
$skin_tones = array(
'1F3FB' => 'skin-tone-2',
'1F3FC' => 'skin-tone-3',
'1F3FD' => 'skin-tone-4',
'1F3FE' => 'skin-tone-5',
'1F3FF' => 'skin-tone-6',
);
foreach($points as $pt) {
if(array_key_exists($pt, $skin_tones))
$skin_tone = $skin_tones[$pt];
}
if(array_key_exists($hexstr, $map)) {
$short_name = $map[$hexstr];
} else {
$short_name = null;
}

$data[] = array(
'emoji' => $ch,
'short_name' => $short_name,
'num_points' => mb_strlen($ch),
'points_hex' => $points,
'hex_str' => $hexstr,
'skin_tone' => $skin_tone,
'byte_offset' => $offset, // The position of the emoji in the string, counting each byte
'grapheme_offset' => $gOffset, // The grapheme-based position of the emoji in the string
);
$skin_tone = null;
$skin_tones = array(
'1F3FB' => 'skin-tone-2',
'1F3FC' => 'skin-tone-3',
'1F3FD' => 'skin-tone-4',
'1F3FE' => 'skin-tone-5',
'1F3FF' => 'skin-tone-6',
);
foreach($points as $pt) {
if(array_key_exists($pt, $skin_tones))
$skin_tone = $skin_tones[$pt];
}

$data[] = array(
'emoji' => $emoji,
'short_name' => $short_name,
'num_points' => mb_strlen($emoji),
'points_hex' => $points,
'hex_str' => $hexstr,
'skin_tone' => $skin_tone,
'byte_offset' => $offset, // The position of the emoji in the string, counting each byte
'grapheme_offset' => $gOffset, // The grapheme-based position of the emoji in the string
);
}

if($prev_encoding)
Expand Down Expand Up @@ -145,12 +188,38 @@ function remove_emoji($string, $opts=[]) {
return $string;
}

/**
 * Report whether a character is one of the codepoints that extend the emoji
 * currently being assembled: a skin tone modifier (U+1F3FB..U+1F3FF), the
 * variation selector U+FE0F, or one of the tag characters listed below.
 *
 * The comparison is strict, so only an identical string matches. With the
 * previous loose in_array() check a boolean true matched every entry.
 *
 * @param mixed $cp A single character, typically one element of mb_str_split()
 * @return bool True when $cp is exactly one of the listed characters
 */
function _is_modifier($cp) {
  // Built once and reused across calls instead of being re-merged every time.
  static $modifiers = [
    "\u{1F3FB}",
    "\u{1F3FC}",
    "\u{1F3FD}",
    "\u{1F3FE}",
    "\u{1F3FF}",
    "\u{FE0F}",
    // Flag letters for subdivision flags
    "\u{E0067}", "\u{E0062}", "\u{E0063}", "\u{E0065}",
    "\u{E006C}", "\u{E006E}", "\u{E0073}", "\u{E0074}", "\u{E0077}",
    "\u{E007F}", // Terminator
  ];

  return in_array($cp, $modifiers, true);
}

/**
 * Report whether a character is the zero width joiner (U+200D).
 *
 * Uses an identity comparison so that non-string values (for example boolean
 * true, which loosely equals any non-empty string) are never reported as a ZWJ.
 *
 * @param mixed $cp A single character
 * @return bool True only when $cp is exactly the string "\u{200D}"
 */
function _is_zwj($cp) {
  return $cp === "\u{200D}";
}

/**
 * Report whether the first character of $cp lies in the range
 * U+1F1E6..U+1F1FF, the codepoints this library treats as country flag letters.
 *
 * The codepoint is computed once, always as UTF-8 (the encoding of PHP's
 * "\u{...}" literals), rather than relying on the ambient mbstring internal
 * encoding. Empty or non-string input yields false instead of reaching
 * mb_ord(), which rejects an empty string.
 *
 * @param mixed $cp A single character
 * @return bool True when the character is within U+1F1E6..U+1F1FF
 */
function _is_country_flag($cp) {
  if(!is_string($cp) || $cp === '') {
    return false;
  }

  $ord = mb_ord($cp, 'UTF-8');
  if($ord === false) {
    // Not a decodable UTF-8 character
    return false;
  }

  return 0x1F1E6 <= $ord && $ord <= 0x1F1FF;
}

/**
 * Read map.json from this file's directory and decode it.
 *
 * The body previously held two consecutive return statements reading the same
 * file (dirname(__FILE__) and __DIR__ are the same directory); the second one
 * could never run, so a single return is kept.
 *
 * @return mixed The decoded JSON, with objects as associative arrays
 */
function _load_map() {
  return json_decode(file_get_contents(__DIR__.'/map.json'), true);
}

/**
 * Build the emoji detection pattern from regexp.json in this file's directory:
 * the decoded JSON string is wrapped in a non-capturing group and given the
 * `u` modifier.
 *
 * @return string A PCRE pattern of the form /(?:...)/u
 */
function _load_regexp() {
  return '/(?:' . json_decode(file_get_contents(__DIR__.'/regexp.json')) . ')/u';
}

/**
 * Read base-codepoints.json from this file's directory and decode it.
 *
 * @return mixed The decoded JSON, with objects as associative arrays
 */
function _load_basecodepoints() {
  return json_decode(file_get_contents(__DIR__.'/base-codepoints.json'), true);
}

function uniord($c) {
Expand Down
1 change: 1 addition & 0 deletions src/base-codepoints.json

Large diffs are not rendered by default.

Loading

0 comments on commit 1fd7d46

Please sign in to comment.