Import GGUF into Ollama

1.Create a Modelfile

##格式0

# Modelfile, format 0: GGUF weights plus a system message and stop sequences.
# No TEMPLATE is declared in this variant.

# Base model: the GGUF weights file to import.
FROM tinyllama-my-model.gguf

# Set the system message
SYSTEM """
You are a super helpful helper.
"""

# Stop generation as soon as either of these sequences is produced.
PARAMETER stop <s>
PARAMETER stop </s>

#格式0的运行方式：ollama run my-model "<s>\nQ: \nWhat is the capital of France?\nA:\n"

##格式1
# Modelfile, format 1: <|system|> / <|user|> / <|assistant|> prompt layout.

# Base model: a GGUF weights file given as a relative path.
FROM ./llama3-unsloth.Q8_0.gguf

# Prompt template. The <|system|> section is rendered only when a system
# message is set; the prompt always ends with <|assistant|> so the model
# continues from there.
TEMPLATE """{{- if .System }}
<|system|>
{{ .System }}
{{- end }}
<|user|>
{{ .Prompt }}
<|assistant|>
"""

# Default system message substituted for {{ .System }} in the template.
SYSTEM """You are a helpful, smart, kind, and efficient AI assistant.Your name is Aila. You always fulfill the user's requests to the best of your ability."""


# Sampling temperature and context window size (in tokens).
PARAMETER temperature 0.8
PARAMETER num_ctx 8192
# Stop generation if the model starts emitting any of the role markers.
PARAMETER stop "<|system|>"
PARAMETER stop "<|user|>"
PARAMETER stop "<|assistant|>"

##格式2
# Modelfile, format 2: same role markers as format 1, but the system and user
# sections are each closed with </s>.

# Base model: the GGUF weights file to import.
FROM GEITje-7B-chat-v2.gguf
# Prompt template. The <|system|> section (including its closing </s>) is
# rendered only when a system message is set.
TEMPLATE """{{- if .System }}
<|system|>
{{ .System }}
</s>
{{- end }}
<|user|>
{{ .Prompt }}
</s>
<|assistant|>
"""
# Sampling temperature and context window size (in tokens).
PARAMETER temperature 0.2
PARAMETER num_ctx 8192
# Stop generation at any role marker or at the </s> terminator.
PARAMETER stop "<|system|>"
PARAMETER stop "<|user|>"
PARAMETER stop "<|assistant|>"
PARAMETER stop "</s>"

##格式3
# Modelfile, format 3: each turn is wrapped in <|im_start|>ROLE ... <|im_end|>.

# Base model: a GGUF weights file under the models/ directory.
FROM models/tinyllama-1.1b-chat-v0.3.Q6_K.gguf
PARAMETER temperature 0.7
# Stop generation when the model emits either turn delimiter.
PARAMETER stop "<|im_start|>"
PARAMETER stop "<|im_end|>"
# Prompt template. Unlike formats 1 and 2 there is no {{ if .System }} guard,
# so the system section is always rendered.
TEMPLATE """
<|im_start|>system
{{ .System }}<|im_end|>
<|im_start|>user
{{ .Prompt }}<|im_end|>
<|im_start|>assistant
"""
# Default system message substituted for {{ .System }} in the template.
SYSTEM """You are a helpful assistant."""

2.Build the Model in Ollama

ollama create example -f Modelfile

3.Run the Model

ollama run example

Windows微调配置-WSL2开启

安装Linux：https://learn.microsoft.com/zh-cn/windows/wsl/install

支持GUI：https://learn.microsoft.com/zh-cn/windows/wsl/tutorials/gui-apps

https://kz16.top/remote/wsl.html

安装ubuntu桌面环境:

sudo apt install ubuntu-desktop

开启GUI远程桌面请参考下面的内容：

#https://www.alibabacloud.com/help/en/simple-application-server/use-cases/use-vnc-to-build-guis-on-ubuntu-18-04-and-20-04#21e0b772d7fgc

# Refresh the package index before installing anything.
apt-get update

# Install the GNOME desktop components used by the VNC session.
apt install -y gnome-panel gnome-settings-daemon metacity nautilus gnome-terminal ubuntu-desktop

# Install the VNC server.
apt-get install tightvncserver

# Start the VNC server once (this run is what creates ~/.vnc/xstartup,
# which is backed up and edited below — NOTE(review): confirm on your system).
vncserver

# Keep a backup of the generated startup script before editing it.
cp ~/.vnc/xstartup ~/.vnc/xstartup.bak

# Open the startup script for editing.
vim ~/.vnc/xstartup

#Press I to enter the edit mode and replace the content of the configuration file with the following content.

#!/bin/sh
# Replacement content for ~/.vnc/xstartup: starts a GNOME Flashback desktop
# inside the VNC session.
export XKL_XMODMAP_DISABLE=1
# Advertise the desktop as GNOME Flashback and select its menu prefix.
export XDG_CURRENT_DESKTOP="GNOME-Flashback:GNOME"
export XDG_MENU_PREFIX="gnome-flashback-"
# Launch the gnome-flashback-metacity session in the background, with the
# acceleration check disabled.
gnome-session --session=gnome-flashback-metacity --disable-acceleration-check &

#Press the Esc key, enter :wq, and then press the Enter key to save and close the file.


# Stop the VNC server running on display :1 so the edited xstartup is used.
vncserver -kill :1

# Start it again on display :1 with a 1920x1080 desktop.
vncserver -geometry 1920x1080 :1


#https://www.realvnc.com/en/connect/download/viewer/

unsloth微调phi 3

数据集:https://huggingface.co/datasets/laion/OIG/tree/main

查看显卡信息的指令 nvidia-smi

环境配置：


# Create and activate a dedicated conda environment on Python 3.10.
conda create --name unsloth_env python=3.10
conda activate unsloth_env

# Install PyTorch with CUDA 12.1 support plus xformers from the
# pytorch / nvidia / xformers channels.
conda install pytorch-cuda=12.1 pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers

# Install Unsloth straight from its GitHub repository, then the training
# extras. --no-deps stops pip from resolving (and possibly replacing) the
# dependencies of those extras.
pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes

微调代码

from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
# Fine-tune the 4-bit Phi-3-mini checkpoint with Unsloth LoRA adapters on the
# LAION OIG "unified_chip2" dataset, then export the result as GGUF files and
# as merged 16-bit weights.

max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
# Get LAION dataset
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files = {"train" : url}, split = "train")

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: the model actually loaded is named explicitly below.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
    "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3
    "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer in 4-bit precision.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = max_seq_length,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Supervised fine-tuning on the dataset's "text" column.
# 2 samples per device x 4 accumulation steps per optimizer update, 60 steps total.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 60,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407,
    ),
)
trainer.train()

# Export GGUF files into the "phit" directory, one call per quantization.
model.save_pretrained_gguf("phit", tokenizer, quantization_method = "q4_k_m")
model.save_pretrained_gguf("phit", tokenizer, quantization_method = "f16")


# Merge the LoRA weights into the base model and save 16-bit weights.
# Unsloth's save_method values are "lora", "merged_16bit" and "merged_4bit";
# a bare "16bit" is not one of them.
model.save_pretrained_merged("phi3test", tokenizer, save_method = "merged_16bit",)
# GGUF uploads take the same quantization names as save_pretrained_gguf
# ("f16", "q8_0", "q4_k_m", ...), so 16-bit is spelled "f16" here.
# NOTE(review): uploading presumably needs Hugging Face credentials (a prior
# login or a `token=` argument) — confirm before running.
model.push_to_hub_gguf("leo009/phi3test", tokenizer, quantization_method = "f16")

safetensors转为gguf：


# Project repository: https://github.com/ggerganov/llama.cpp

# Enter the llama.cpp checkout and rebuild everything from a clean tree.
cd llama.cpp && make clean && make all -j

# Convert the saved safetensors weights to a single GGUF file.
# NOTE(review): only the first of the two shards is named on the command
# line — confirm that convert.py picks up the remaining shard on its own.
python convert.py /home/Ubuntu/phit/model-00001-of-00002.safetensors --outfile /home/Ubuntu/phit/your_output.gguf --pad-vocab

AutoGen代码


def _local_llm_config(model_name):
    """Return a fresh AutoGen-style ``llm_config`` dict for one model.

    Every call builds new dict/list objects, so the returned configs can be
    edited independently of each other.
    """
    endpoint = {
        "model": model_name,
        "base_url": "http://localhost:1234/v1",
        "api_key": "lm-studio",
    }
    return {
        "config_list": [endpoint],
        "cache_seed": None,  # Disable caching.
    }


# "模型" is the model-name value used in these notes — presumably a placeholder
# to be replaced with the name of the model actually being served.
llama3 = _local_llm_config("模型")

phi3 = _local_llm_config("模型")




from autogen import ConversableAgent

# Two-agent writing loop: Jack drafts an article and Emma reviews it.
# The system messages (in Chinese) define the roles:
#   - Jack: a Chinese-language AI writer who writes articles on a given topic
#     and returns a complete revised article whenever Emma gives feedback.
#   - Emma: a Chinese-language article reviewer who evaluates Jack's article
#     and gives revision suggestions on every turn.
jack = ConversableAgent(
    "Jack (Phi-3)",
    llm_config=phi3,
    # Alternative persona kept from the original notes:
    #system_message="Your name is Jack and you are a comedian in a two-person comedy show.",
    system_message="你的名字叫Jack，你是一个中文AI作家。你的角色是根据指定主题创作引人入胜且信息丰富的文章，并且根据你的同事Emma的建议来修改和完善你创作的文章，每当你收到Emma的建议时，都要根据Emma的建议给出修改和完善后的完整文章。",
)
emma = ConversableAgent(
    "Emma (llama3)",
    llm_config=llama3,
    # Alternative persona kept from the original notes:
    #system_message="Your name is Emma and you are a comedian in two-person comedy show.",
    system_message="你的名字叫Emma，你的角色是一个中文AI文章评审员。你的任务是针对你的同事Jack所写的文章评估并提出改进建议，每次对话你都要对文章作出评估并给出修改建议。",
)

# Emma opens the conversation by asking Jack (in Chinese) to write an article
# about a scientist travelling to the future; the exchange is capped with
# max_turns=3. The prompt previously read "穿遇到未来", a typo for
# "穿越到未来" ("travel to the future").
chat_result = emma.initiate_chat(
    jack,
    message="Jack，请用中文写一篇关于科学家穿越到未来的文章。",
    max_turns=3,
)

Import GGUF into Ollama

如有问题请联系我的微信:stoeng

Windows微调配置-WSL2开启

unsloth微调phi 3

环境配置：

微调代码

safetensors转为gguf：

如有问题请联系我的微信:stoeng

AutoGen代码