
# 27.1 Large Language Model Fundamentals

## Overview of Large Language Models

Large language models (LLMs) are a core component of modern AI systems. Through pre-training on massive text corpora, they acquire broad linguistic knowledge and reasoning abilities. Claude Code is a programming assistant built on top of a powerful large language model.

## Basic Concepts of Large Language Models

### 1. What Is a Large Language Model?

A large language model is a deep-learning-based AI model with the following characteristics:

- **Large parameter count**: model sizes range from billions to hundreds of billions of parameters
- **Massive training data**: trained on large-scale corpora of internet text, code, books, and more
- **Strong generalization**: able to handle a wide variety of natural-language and programming tasks
- **Context understanding**: able to understand long texts and complex context

### 2. Architecture of Large Language Models

Example: a simplified large language model architecture

User request: "Explain the basic architecture of a large language model"

Explanation generated by Claude Code:

## Large Language Model Architecture

### The Transformer Architecture

```text
Embedding Layer
      ↓
Positional Encoding
      ↓
Encoder Layers × N
      ↓
Decoder Layers × N
      ↓
Output Layer
      ↓
Probability Distribution
```
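Several models later in this section call a `PositionalEncoding` module that the text never defines. Below is a minimal sketch, assuming the standard sinusoidal encoding from the original Transformer; the constructor arguments mirror how the module is used later.

```python
import math

import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (a sketch; the exact variant is our assumption)."""

    def __init__(self, embed_size, max_length):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_length, embed_size)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size)
        )
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
        self.register_buffer('pe', pe.unsqueeze(0))   # (1, max_length, embed_size)

    def forward(self, x):
        # x: (batch, seq_len, embed_size); add the encoding for the first seq_len positions
        return x + self.pe[:, :x.size(1)]
```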

### Core Components

#### 1. Self-Attention

```python
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    """Multi-head self-attention."""

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert self.head_dim * heads == embed_size, "embed_size must be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)

        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embeddings into multiple heads
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        # Per-head linear projections
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Attention scores: (N, heads, query_len, key_len)
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        # Apply the mask (positions where mask == 0 are blocked)
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Scaled softmax over the key dimension
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        # Weighted sum of the values
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values])

        # Concatenate the heads back together
        out = out.reshape(N, query_len, self.heads * self.head_dim)

        return self.fc_out(out)
```

> **What self-attention does**:
> - Captures long-range dependencies within a sequence
> - Dynamically weights the importance of different positions
> - Enables parallel computation
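A quick shape check of the module above (a usage sketch; the sizes are arbitrary):

```python
attention = SelfAttention(embed_size=256, heads=8)
x = torch.randn(2, 10, 256)          # (batch, seq_len, embed_size)
out = attention(x, x, x, mask=None)  # self-attention: values, keys, and queries are all x
print(out.shape)                     # torch.Size([2, 10, 256])
```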

#### 2. Feed-Forward Network

```python
class FeedForward(nn.Module):
    """Position-wise feed-forward network."""

    def __init__(self, embed_size, ff_hidden_size, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(embed_size, ff_hidden_size)
        self.linear2 = nn.Linear(ff_hidden_size, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):
        # Expand, apply the non-linearity, then project back down
        x = self.linear1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.linear2(x)
        x = self.dropout(x)
        return x
```
> **What the feed-forward network does**:
> - Increases the model's expressive power
> - Introduces non-linear transformations
> - Processes feature interactions

#### 3. Transformer Block

```python
class TransformerBlock(nn.Module):
    """A single Transformer block."""

    def __init__(self, embed_size, heads, ff_hidden_size, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = FeedForward(embed_size, ff_hidden_size, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        # Multi-head attention + residual connection + layer norm
        attention = self.attention(value, key, query, mask)
        x = self.dropout(attention) + query
        x = self.norm1(x)

        # Feed-forward network + residual connection + layer norm
        forward = self.feed_forward(x)
        out = self.dropout(forward) + x
        out = self.norm2(out)

        return out
```

> **What the Transformer block does**:
> - Combines attention with a feed-forward network
> - Residual connections mitigate vanishing gradients
> - Layer normalization stabilizes training
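Stacking N of these blocks gives the "× N" layers shown in the architecture diagram above (a usage sketch with arbitrary sizes):

```python
blocks = nn.ModuleList([TransformerBlock(256, 8, 1024) for _ in range(6)])
x = torch.randn(2, 10, 256)
for block in blocks:
    x = block(x, x, x, mask=None)
print(x.shape)  # torch.Size([2, 10, 256])
```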

## Training Large Language Models

### 1. Pre-training

Example: pre-training a large language model

User request: "Explain the pre-training process of a large language model"

Explanation generated by Claude Code:

## The Pre-training Process

### Training Objectives

#### 1. Masked Language Modeling (MLM)

```python

class MaskedLanguageModel(nn.Module):
    """Masked language model."""

    def __init__(self, vocab_size, embed_size, num_layers, heads, ff_hidden_size, max_length):
        super(MaskedLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, max_length)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embed_size, heads, ff_hidden_size)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask):
        # Embedding + positional encoding
        x = self.embedding(x)
        x = self.positional_encoding(x)
        x = self.dropout(x)

        # Transformer layers
        for block in self.transformer_blocks:
            x = block(x, x, x, mask)

        # Output layer
        out = self.fc_out(x)

        return out

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Prepare masked inputs and labels for MLM training."""
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)

    # Never mask special tokens
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)

    # Select ~15% of tokens; only these contribute to the loss (-100 is ignored)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100

    # 80% of the selected tokens are replaced with the mask token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% are replaced with a random token; the remaining 10% stay unchanged
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    return inputs, labels
```

> **The MLM training objective**:
> - Randomly mask a fraction of the input tokens
> - Predict the masked tokens
> - Learn bidirectional context representations
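The -100 labels tie directly into the loss: PyTorch's cross-entropy ignores that index, so only masked positions are scored. A minimal training-step sketch (the model, tokenizer, and input batch are assumed to exist):

```python
import torch.nn.functional as F

masked_inputs, labels = mask_tokens(inputs, tokenizer)  # inputs: (batch, seq_len)
logits = model(masked_inputs, mask=None)                # (batch, seq_len, vocab_size)
loss = F.cross_entropy(
    logits.view(-1, logits.size(-1)),  # flatten to (batch * seq_len, vocab_size)
    labels.view(-1),
    ignore_index=-100,                 # unmasked positions do not contribute
)
loss.backward()
```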

#### 2. Causal Language Modeling (CLM)

```python
class CausalLanguageModel(nn.Module):
    """Causal (autoregressive) language model."""

    def __init__(self, vocab_size, embed_size, num_layers, heads, ff_hidden_size, max_length):
        super(CausalLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, max_length)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embed_size, heads, ff_hidden_size)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask):
        # Embedding + positional encoding
        x = self.embedding(x)
        x = self.positional_encoding(x)
        x = self.dropout(x)

        # Transformer layers
        for block in self.transformer_blocks:
            x = block(x, x, x, mask)

        # Output layer
        out = self.fc_out(x)

        return out

def create_causal_mask(size):
    """Create a causal mask: 1 where attention is allowed (current and past
    positions), 0 for future positions. This matches the convention in
    SelfAttention above, which blocks positions where mask == 0."""
    return torch.tril(torch.ones(size, size))
```
> **The CLM training objective**:
> - Predict the next token
> - Learn unidirectional (left-to-right) context representations
> - Well suited to text generation tasks
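Training pairs come from shifting the sequence by one position: the model sees tokens up to position t and is scored on the token at t + 1. A minimal sketch (the sizes are arbitrary):

```python
import torch.nn.functional as F

model = CausalLanguageModel(vocab_size=10000, embed_size=256, num_layers=4,
                            heads=8, ff_hidden_size=1024, max_length=512)
tokens = torch.randint(0, 10000, (2, 64))  # (batch, seq_len)

mask = create_causal_mask(64)              # (seq_len, seq_len), broadcast over batch and heads
logits = model(tokens, mask)               # (batch, seq_len, vocab_size)

# Shift by one: positions 0..n-2 predict tokens 1..n-1
loss = F.cross_entropy(
    logits[:, :-1].reshape(-1, 10000),
    tokens[:, 1:].reshape(-1),
)
```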

### 2. Fine-tuning

Example: fine-tuning a large language model

User request: "Explain the fine-tuning process of a large language model"

Explanation generated by Claude Code:

## The Fine-tuning Process

### Instruction Tuning

```python
class InstructionDataset(torch.utils.data.Dataset):
    """Instruction-tuning dataset."""

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Format the instruction
        prompt = f"Instruction: {item['instruction']}\n"
        if 'input' in item and item['input']:
            prompt += f"Input: {item['input']}\n"
        prompt += f"Output: {item['output']}"

        # Tokenize
        encoded = self.tokenizer(
            prompt,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            # Labels mirror the inputs; production code would usually set
            # padding positions to -100 so they are ignored by the loss
            'labels': encoded['input_ids'].squeeze()
        }

def fine_tune_model(model, train_dataset, val_dataset, config):
    """Fine-tune the model."""
    # Data loaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True
    )

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=config['batch_size'],
        shuffle=False
    )

    # Optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config['weight_decay']
    )

    # Learning-rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=config['num_epochs']
    )

    # Training loop
    for epoch in range(config['num_epochs']):
        model.train()
        total_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()

            # Forward pass
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            loss = outputs.loss

            # Backward pass with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])
            optimizer.step()

            total_loss += loss.item()

        # Validation
        val_loss = evaluate_model(model, val_loader)

        # Learning-rate schedule step
        scheduler.step()

        print(f"Epoch {epoch + 1}/{config['num_epochs']}")
        print(f"  Train Loss: {total_loss / len(train_loader):.4f}")
        print(f"  Val Loss: {val_loss:.4f}")

def evaluate_model(model, val_loader):
    """Evaluate the model on the validation set."""
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )
            total_loss += outputs.loss.item()

    return total_loss / len(val_loader)
```

> **What instruction tuning does**:
> - Teaches the model to understand and follow instructions
> - Improves performance on specific tasks
> - Strengthens the model's interactive abilities
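For concreteness, a record in the format InstructionDataset expects might look like this (illustrative data, not from the original text):

```python
sample = {
    'instruction': 'Write a Python function that implements quicksort.',
    'input': '',  # optional extra context; empty here
    'output': 'def quick_sort(arr): ...'
}
dataset = InstructionDataset([sample], tokenizer)  # tokenizer is assumed to exist
```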
    ### 3. RLHF (Reinforcement Learning from Human Feedback)

Example: RLHF training

User request: "Explain the RLHF training process"

Explanation generated by Claude Code:

## The RLHF Training Process

### Training Steps

#### Step 1: Reward Model Training

```python

class RewardModel(nn.Module):
    """Reward model built on a pretrained base model."""

    def __init__(self, base_model):
        super(RewardModel, self).__init__()
        self.base_model = base_model
        self.reward_head = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Score the sequence from the hidden state of the final position
        reward = self.reward_head(outputs.last_hidden_state[:, -1, :])
        return reward

def train_reward_model(reward_model, comparison_data, config):
    """Train the reward model on human preference comparisons."""
    optimizer = torch.optim.AdamW(reward_model.parameters(), lr=config['learning_rate'])

    for epoch in range(config['num_epochs']):
        total_loss = 0

        for batch in comparison_data:
            optimizer.zero_grad()

            # Score both responses
            reward_chosen = reward_model(
                input_ids=batch['chosen_ids'],
                attention_mask=batch['chosen_mask']
            )

            reward_rejected = reward_model(
                input_ids=batch['rejected_ids'],
                attention_mask=batch['rejected_mask']
            )

            # Pairwise loss: the chosen response should score higher
            loss = -torch.log(torch.sigmoid(reward_chosen - reward_rejected)).mean()

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(comparison_data):.4f}")
```

#### Step 2: PPO Training

```python
def ppo_train(policy_model, reward_model, data, config):
    """Train the policy model with PPO."""
    optimizer = torch.optim.AdamW(policy_model.parameters(), lr=config['learning_rate'])

    for epoch in range(config['num_epochs']):
        for batch in data:
            # Generate responses with the current policy and score them (no gradients)
            with torch.no_grad():
                old_log_probs, old_values = generate_response(
                    policy_model,
                    batch['input_ids'],
                    batch['attention_mask']
                )

                # Score the responses with the reward model
                rewards = reward_model(
                    input_ids=batch['response_ids'],
                    attention_mask=batch['response_mask']
                )

                # Estimate advantages
                advantages = compute_advantages(rewards, old_values)

            # PPO updates
            for _ in range(config['ppo_epochs']):
                # Re-evaluate the responses under the updated policy
                log_probs, values = generate_response(
                    policy_model,
                    batch['input_ids'],
                    batch['attention_mask']
                )

                # Probability ratio between the new and old policies
                ratio = torch.exp(log_probs - old_log_probs)

                # Clipped surrogate objective
                surr1 = ratio * advantages
                surr2 = torch.clamp(ratio, 1 - config['clip_eps'], 1 + config['clip_eps']) * advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                # Value-function loss
                value_loss = nn.MSELoss()(values, rewards)

                # Total loss
                loss = policy_loss + config['value_coef'] * value_loss

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
```

> **What RLHF does**:
> - Aligns model outputs with human preferences
> - Improves the model's helpfulness and safety
> - Reduces harmful outputs
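The PPO loop calls two helpers, generate_response and compute_advantages, that the text never defines. generate_response is assumed to return per-sequence log-probabilities and value estimates; below is a minimal sketch of the simplest advantage estimate (reward minus value baseline, normalized). Production RLHF code typically uses generalized advantage estimation (GAE) instead.

```python
def compute_advantages(rewards, values):
    """Reward minus the value baseline, normalized for stable updates.
    (A sketch; the original text does not define this helper.)"""
    advantages = rewards.detach() - values.detach()
    return (advantages - advantages.mean()) / (advantages.std() + 1e-8)
```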

## Applications of Large Language Models

### 1. Code Generation

Example: a code-generation application

User request: "Show how large language models are used for code generation"

Example generated by Claude Code:

## Code Generation Applications

### Basic Code Generation

```python
def generate_code(prompt, model, tokenizer, max_length=512):
    """Generate code from a natural-language prompt."""
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors='pt')

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the output
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_code

# Usage example
prompt = """
Write a Python function that implements the quicksort algorithm.
"""

code = generate_code(prompt, model, tokenizer)
print(code)
```
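The sampling parameters control the diversity/determinism trade-off: temperature rescales the logits before the softmax, and top_p restricts sampling to the smallest set of tokens whose cumulative probability reaches p. For reproducible output you can disable sampling entirely (a sketch, reusing model and inputs from above):

```python
# Greedy decoding: deterministic, always picks the highest-probability token
outputs = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=512,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)
```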

### Code Completion

```python
def complete_code(partial_code, model, tokenizer, max_length=256):
    """Complete a partial code snippet."""
    # Tokenize the input
    inputs = tokenizer(partial_code, return_tensors='pt')

    # Generate the completion
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            temperature=0.5,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the output
    completed_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return completed_code

# Usage example
partial_code = """
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
"""

completed_code = complete_code(partial_code, model, tokenizer)
print(completed_code)
```
### Code Explanation

```python
def explain_code(code, model, tokenizer, max_length=512):
    """Explain what a piece of code does."""
    prompt = f"Please explain what the following code does:\n\n{code}\n"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors='pt')

    # Generate the explanation
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the output
    explanation = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return explanation

# Usage example

code = """
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quick_sort(left) + middle + quick_sort(right)
"""

explanation = explain_code(code, model, tokenizer)
print(explanation)
```
