PHI3.5MOE.md

The Phi 3.5 MoE model is a 16x3.8B-parameter decoder-only text-to-text mixture-of-experts LLM.

  • Context length of 128k tokens
  • Trained on 4.9T tokens
  • 16 experts (16x3.8B parameters) with 6.6B active parameters
  • Expect inference performance of a 7B model

About the MoE mechanism

  1. Compute the router gating logits
  2. From the router gating logits, select the top-2 experts and their associated weights
  3. The hidden state for each token in the sequence is computed by applying each selected expert to that token and weighting the expert's output (see the sketch below).
    • If multiple experts are selected for a token, this becomes a weighted sum
    • The design is flexible: 2 or 1 experts can be selected, enabling denser or sparser gating
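
To make the routing concrete, here is a minimal NumPy sketch of top-2 gating. It is illustrative only, not the mistral.rs implementation: `moe_layer`, `router_weight`, and the `experts` callables are hypothetical names, and details such as where the gating weights are normalized differ in the real model.

```python
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def moe_layer(hidden_states, router_weight, experts, top_k=2):
    # 1) Router gating logits for every token: (seq_len, num_experts)
    logits = hidden_states @ router_weight
    # 2) Select the top-k experts per token and normalize their weights
    top_idx = np.argsort(logits, axis=-1)[:, -top_k:]           # (seq_len, top_k)
    top_logits = np.take_along_axis(logits, top_idx, axis=-1)
    top_weights = softmax(top_logits, axis=-1)                   # (seq_len, top_k)
    # 3) Each token's output is the weighted sum of its selected experts' outputs
    out = np.zeros_like(hidden_states)
    for t in range(hidden_states.shape[0]):
        for k in range(top_k):
            expert_fn = experts[top_idx[t, k]]
            out[t] += top_weights[t, k] * expert_fn(hidden_states[t])
    return out
```

Because only `top_k` expert FFNs run per token, the per-token compute tracks the active parameter count (6.6B here) rather than the total parameter count, which is why inference cost is comparable to a dense ~7B model.
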
Run the model with in-situ quantization (ISQ) to Q4K:

```bash
./mistralrs-server --isq Q4K -i plain -m microsoft/Phi-3.5-MoE-instruct -a phi3.5moe
```

HTTP API

```python
import openai

# The original snippet uses a `client` that was not defined; construct an
# OpenAI-compatible client pointed at a running mistralrs-server instance
# (the port and API key below are placeholders, adjust as needed).
client = openai.OpenAI(api_key="placeholder", base_url="http://localhost:1234/v1/")

messages = []
prompt = input("Enter system prompt >>> ")
if len(prompt) > 0:
    messages.append({"role": "system", "content": prompt})


while True:
    prompt = input(">>> ")
    messages.append({"role": "user", "content": prompt})
    completion = client.chat.completions.create(
        model="phi3.5moe",
        messages=messages,
        max_tokens=256,
        frequency_penalty=1.0,
        top_p=0.1,
        temperature=0,
    )
    resp = completion.choices[0].message.content
    print(resp)
    messages.append({"role": "assistant", "content": resp})

Python API

```python
from mistralrs import Runner, Which, ChatCompletionRequest, Architecture

runner = Runner(
    which=Which.Plain(
        model_id="microsoft/Phi-3.5-MoE-instruct",
        arch=Architecture.Phi3_5MoE,
    ),
)

res = runner.send_chat_completion_request(
    ChatCompletionRequest(
        model="mistral",
        messages=[
            {"role": "user", "content": "Tell me a story about the Rust type system."}
        ],
        max_tokens=256,
        presence_penalty=1.0,
        top_p=0.1,
        temperature=0.1,
    )
)
print(res.choices[0].message.content)
print(res.usage)
```
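
The CLI example above applies ISQ with `--isq Q4K`; the Python `Runner` can be configured similarly. This is a sketch: the `in_situ_quant` keyword below is an assumption about the mistral.rs Python API and may differ between versions, so verify it against the installed package.

```python
from mistralrs import Runner, Which, Architecture

# Sketch: apply in-situ quantization when constructing the Runner.
# `in_situ_quant="Q4K"` is assumed; check the exact keyword and accepted
# values for your installed mistralrs version.
runner = Runner(
    which=Which.Plain(
        model_id="microsoft/Phi-3.5-MoE-instruct",
        arch=Architecture.Phi3_5MoE,
    ),
    in_situ_quant="Q4K",
)
```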

Rust API

You can find this example here.

```rust
use std::sync::Arc;
// The remaining types (NormalLoaderBuilder, MistralRsBuilder, SchedulerConfig,
// IsqType, etc.) and the `best_device` helper come from the mistralrs crate;
// see the full example referenced above for the exact imports.

fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select the Phi 3.5 MoE model
    let loader = NormalLoaderBuilder::new(
        NormalSpecificConfig {
            use_flash_attn: false,
            prompt_batchsize: None,
            topology: None,
            organization: Default::default(),
        },
        None,
        None,
        Some("microsoft/Phi-3.5-MoE-instruct".to_string()),
    )
    .build(NormalLoaderType::Phi3_5MoE)?;
    // Load the model into a Pipeline
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &best_device()?,
        false,
        DeviceMapMetadata::dummy(),
        Some(IsqType::Q4K),
        None, // No PagedAttention.
    )?;
    // Create the MistralRs, which is a runner
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}
```