brew install pipx
pipx ensurepath # as needed
pipx install mlx-lm
pipx inject mlx-lm tiktoken # for Kimi-Linear
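
To confirm the installed version (the note at the end gives the release Kimi-Linear needs), one option is:

# Show the mlx-lm version inside the pipx environment
pipx runpip mlx-lm show mlx-lm | grep Version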

For the API server,

# First run downloads the model, then starts the server
mlx_lm.server --model mlx-community/Kimi-Linear-48B-A3B-Instruct-4bit --port <port> --max-tokens=200000
 
# After the first download, run offline; review the model's custom code before enabling --trust-remote-code
HF_HUB_OFFLINE=1 mlx_lm.server --model mlx-community/Kimi-Linear-48B-A3B-Instruct-4bit --port 6599 --max-tokens=200000 --trust-remote-code
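
The server speaks the OpenAI chat completions API, so a quick smoke test with curl (port 6599 as above) looks like this:

# Send a single chat request to the local server
curl http://localhost:6599/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "mlx-community/Kimi-Linear-48B-A3B-Instruct-4bit",
       "messages": [{"role": "user", "content": "Say hello in one sentence."}],
       "max_tokens": 50}'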

For simple chat,

mlx_lm.chat --model mlx-community/Kimi-Linear-48B-A3B-Instruct-4bit
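
For a one-shot completion instead of an interactive session, mlx_lm.generate works the same way:

mlx_lm.generate --model mlx-community/Kimi-Linear-48B-A3B-Instruct-4bit \
  --prompt "Explain linear attention in two sentences." \
  --max-tokens 200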
 
# or with `llm` via the API server:
pipx install llm
 
cat > "$(dirname "$(llm logs path)")"/extra-openai-models.yaml <<'EOF'
- model_id: kimi-linear
  model_name: mlx-community/Kimi-Linear-48B-A3B-Instruct-4bit
  api_base: "http://localhost:6599/v1"
EOF
llm models default kimi-linear
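
With the default set, prompts go straight to the local server (assuming it is still running on port 6599):

llm "What is linear attention?"
# or name the model explicitly:
llm -m kimi-linear "What is linear attention?"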

Models downloaded from the Hugging Face Hub are cached under ~/.cache/huggingface/.
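
To check how much disk space cached models take up:

du -sh ~/.cache/huggingface/hub/*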

Note: Kimi-Linear-48B-A3B-Instruct-4bit requires mlx-lm 0.28.5 and about 26.1 GB of RAM. That is roughly 41% of a 64 GB Mac's memory, so models of 32B parameters or smaller are a better fit.
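
As a sanity check, total physical memory on macOS can be read with:

sysctl -n hw.memsize | awk '{printf "%.0f GB\n", $1/1073741824}'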

References