Skip to content

Commit

Permalink
chore: use tiktoken instead of js-tiktoken (#4094)
Browse files: browse the repository at this point in the history
  • Loading branch information
Aaaaash authored Oct 15, 2024
1 parent a2c4de0 commit 33b2de6
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 19 deletions.
2 changes: 1 addition & 1 deletion packages/ai-native/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@
"@xterm/xterm": "5.5.0",
"ansi-regex": "^2.0.0",
"dom-align": "^1.7.0",
"js-tiktoken": "1.0.12",
"react-chat-elements": "^12.0.10",
"react-highlight": "^0.15.0",
"tiktoken": "1.0.12",
"web-tree-sitter": "0.22.6"
},
"devDependencies": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// @ts-ignore
import { Tiktoken } from 'js-tiktoken';
import { Tiktoken } from 'tiktoken/lite';

import { Injector } from '@opensumi/di';

Expand Down Expand Up @@ -59,7 +58,12 @@ export const getMarkerForSnippets = (text: string, language: string) => {
return lines.map((line) => getMarkerByLanguage(line, language)).join('\n');
};

export const getCroppedTextByLine = (text: string, maxTokenSize: number, textTokens: number[][], reverse = false) => {
export const getCroppedTextByLine = (
text: string,
maxTokenSize: number,
textTokens: Uint32Array[],
reverse = false,
) => {
const currentTokenSize = textTokens.reduce((prev, cur) => prev + cur.length, 0);
if (currentTokenSize < maxTokenSize) {
return text;
Expand Down Expand Up @@ -110,15 +114,15 @@ export const getCroppedTextByLine = (text: string, maxTokenSize: number, textTok
export const getCroppedText = async (
text: string,
maxTokenSize: number,
textTokens: number[][],
textTokens: Uint32Array[],
strategy = StrategyType.InterceptBasedOnLine,
tokenizer: Tiktoken,
parser?: LanguageParser,
minBlockSize = 20,
reverse = false,
token?: monaco.CancellationToken,
): Promise<string> => {
let tokens: number[];
let tokens: Uint32Array;
if (strategy === StrategyType.InterceptBasedOnLine) {
// 按行进行裁剪
text = getCroppedTextByLine(text, maxTokenSize, textTokens, reverse);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// @ts-ignore
import { Tiktoken, getEncoding } from 'js-tiktoken';
import { Tiktoken, get_encoding } from 'tiktoken';

import { TokenizerName } from '../types';

Expand All @@ -10,7 +9,7 @@ export const getTokenizer = (tokenizerName = TokenizerName.cl100k_base) => {
if (tokenizer) {
return tokenizer;
}
tokenizer = getEncoding('cl100k_base');
tokenizer = get_encoding('cl100k_base');
TOKENIZER_CACHE.set(tokenizerName, tokenizer);
return tokenizer;
};
3 changes: 3 additions & 0 deletions tools/dev-tool/src/webpack.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ exports.createWebpackConfig = function (dir, entry, extraConfig) {
cache: {
type: 'filesystem',
},
experiments: {
asyncWebAssembly: true, // 启用 WebAssembly 支持
},
resolve: {
extensions: ['.ts', '.tsx', '.js', '.json', '.less'],
plugins: [
Expand Down
18 changes: 8 additions & 10 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2265,9 +2265,9 @@ __metadata:
"@xterm/xterm": "npm:5.5.0"
ansi-regex: "npm:^2.0.0"
dom-align: "npm:^1.7.0"
js-tiktoken: "npm:1.0.12"
react-chat-elements: "npm:^12.0.10"
react-highlight: "npm:^0.15.0"
tiktoken: "npm:1.0.12"
web-tree-sitter: "npm:0.22.6"
languageName: unknown
linkType: soft
Expand Down Expand Up @@ -12221,15 +12221,6 @@ __metadata:
languageName: node
linkType: hard

"js-tiktoken@npm:1.0.12":
version: 1.0.12
resolution: "js-tiktoken@npm:1.0.12"
dependencies:
base64-js: "npm:^1.5.1"
checksum: 10/21aaa9302409fefc5ac18695579b04e0223cdda2566e5cc4a95de228333bbadfc2e16110fceca1824d7faa43081ef378e6bc72238a2230374a85f88638556305
languageName: node
linkType: hard

"js-tokens@npm:^3.0.0 || ^4.0.0, js-tokens@npm:^4.0.0":
version: 4.0.0
resolution: "js-tokens@npm:4.0.0"
Expand Down Expand Up @@ -18889,6 +18880,13 @@ __metadata:
languageName: node
linkType: hard

"tiktoken@npm:1.0.12":
version: 1.0.12
resolution: "tiktoken@npm:1.0.12"
checksum: 10/595ab2e93f1937a1af8baf057f1a2951cfd7b5d4c0986807c1fc8e1a2118800a31cec81cdbcb115c4be5eec4fd9d5c23ac23691a227658c1010aeefec5dfa3ea
languageName: node
linkType: hard

"timeago.js@npm:^4.0.2":
version: 4.0.2
resolution: "timeago.js@npm:4.0.2"
Expand Down

0 comments on commit 33b2de6

Please sign in to comment.