# Phi 3.5 MoE Model: microsoft/Phi-3.5-MoE-instruct
The Phi 3.5 MoE model is a 16x3.8B parameter decoder-only text-to-text mixture-of-experts LLM.
- Context length of 128k tokens
- Trained on 4.9T tokens
- 16 experts (16x3.8B parameters) with 6.6B active parameters
- Expect inference performance comparable to a 7B dense model
The MoE layers route each token to a small subset of experts:

- Compute the router gating logits for each token.
- From the gating logits, select the top-2 experts and their associated routing weights.
- The hidden state for each token is computed by applying each selected expert to that token and weighting the output by the expert's routing weight.
- If multiple experts are selected for a token, the expert outputs combine as a weighted sum.
- The design is flexible: either 1 or 2 experts can be selected, enabling sparser or denser gating (see the sketch after this list).
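Below is a minimal NumPy sketch of generic top-2 softmax routing to illustrate the scheme above. Names, shapes, and the renormalization of the selected weights are assumptions for illustration, not Phi 3.5 MoE's actual gating implementation:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def moe_layer(hidden, router_w, experts, top_k=2):
    # hidden: (seq_len, d_model); router_w: (d_model, n_experts);
    # experts: list of callables mapping (d_model,) -> (d_model,).
    logits = hidden @ router_w                    # router gating logits
    probs = softmax(logits)                       # per-token expert distribution
    top = np.argsort(-probs, axis=-1)[:, :top_k]  # indices of the top-k experts
    out = np.zeros_like(hidden)
    for t in range(hidden.shape[0]):
        weights = probs[t, top[t]]
        weights = weights / weights.sum()         # renormalize selected weights (assumed)
        for idx, w in zip(top[t], weights):
            # Weighted sum of the selected experts' outputs for this token.
            out[t] += w * experts[idx](hidden[t])
    return out

# Toy usage: 16 random linear "experts" with top-2 routing.
rng = np.random.default_rng(0)
d, n_experts = 8, 16
experts = [(lambda W: lambda x: x @ W)(rng.standard_normal((d, d)) * 0.1)
           for _ in range(n_experts)]
hidden = rng.standard_normal((4, d))
print(moe_layer(hidden, rng.standard_normal((d, n_experts)), experts).shape)  # (4, 8)
```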
## Interactive mode

Launch an interactive chat session, applying in-situ quantization (ISQ) at Q4K:

```bash
./mistralrs-server --isq Q4K -i plain -m microsoft/Phi-3.5-MoE-instruct -a phi3.5moe
```
## HTTP server

The server exposes an OpenAI-compatible HTTP API, so the official `openai` Python client can talk to it. The snippet below assumes the server was started in HTTP mode on port 1234 (for example with `--port 1234` in place of `-i`); adjust `base_url` and `api_key` to your setup.

```python
import openai

# Point the client at the local mistral.rs server (port assumed to be 1234).
# Any placeholder api_key works if the server has no key configured.
client = openai.OpenAI(base_url="http://localhost:1234/v1", api_key="EMPTY")

messages = []
prompt = input("Enter system prompt >>> ")
if len(prompt) > 0:
    messages.append({"role": "system", "content": prompt})

while True:
    prompt = input(">>> ")
    messages.append({"role": "user", "content": prompt})
    completion = client.chat.completions.create(
        model="phi3.5moe",
        messages=messages,
        max_tokens=256,
        frequency_penalty=1.0,
        top_p=0.1,
        temperature=0,
    )
    resp = completion.choices[0].message.content
    print(resp)
    messages.append({"role": "assistant", "content": resp})
```
## Python API

You can also run the model directly from Python with the `mistralrs` package:

```python
from mistralrs import Runner, Which, ChatCompletionRequest, Architecture

runner = Runner(
    which=Which.Plain(
        model_id="microsoft/Phi-3.5-MoE-instruct",
        arch=Architecture.Phi3_5MoE,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="mistral",
        messages=[
            {"role": "user", "content": "Tell me a story about the Rust type system."}
        ],
        max_tokens=256,
        presence_penalty=1.0,
        top_p=0.1,
        temperature=0.1,
    )
)
print(res.choices[0].message.content)
print(res.usage)
```
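ISQ can also be applied from the Python API. A minimal sketch, assuming the `Runner` constructor's `in_situ_quant` parameter (the name and accepted values may differ across `mistralrs` versions):

```python
from mistralrs import Runner, Which, Architecture

# `in_situ_quant` is assumed to mirror the CLI's --isq flag; verify against
# your installed mistralrs version.
runner = Runner(
    which=Which.Plain(
        model_id="microsoft/Phi-3.5-MoE-instruct",
        arch=Architecture.Phi3_5MoE,
    ),
    in_situ_quant="Q4K",
)
```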
## Rust API

You can find the full Rust example in the mistral.rs repository.
```rust
use std::sync::Arc;

// NOTE: these paths are assumed re-exports from the `mistralrs` crate, and
// `best_device` is assumed to resolve the best available device
// (CUDA/Metal/CPU). Adjust to match your installed version.
use mistralrs::{
    best_device, DefaultSchedulerMethod, DeviceMapMetadata, IsqType, MistralRs,
    MistralRsBuilder, ModelDType, NormalLoaderBuilder, NormalLoaderType,
    NormalSpecificConfig, SchedulerConfig, TokenSource,
};

fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select the Phi 3.5 MoE model
    let loader = NormalLoaderBuilder::new(
        NormalSpecificConfig {
            use_flash_attn: false,
            prompt_batchsize: None,
            topology: None,
            organization: Default::default(),
        },
        None,
        None,
        Some("microsoft/Phi-3.5-MoE-instruct".to_string()),
    )
    .build(NormalLoaderType::Phi3_5MoE)?;
    // Load the model from the Hugging Face Hub into a Pipeline, applying ISQ (Q4K).
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &best_device()?,
        false,
        DeviceMapMetadata::dummy(),
        Some(IsqType::Q4K),
        None, // No PagedAttention.
    )?;
    // Create the MistralRs, which is a runner
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}
```