GGML

std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
}

return "The";
}

vocab

vocabulary

Converting natural-language text into a numeric representation that a computer can process.

A typical text-processing pipeline consists of two steps, sketched in code below:

Tokenization: split the input natural-language text into a sequence of tokens according to some rule; a token can be a word, a subword, a character, and so on.

Lookup: map each token produced by tokenization to its numeric ID in the vocabulary.
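
To make the two steps concrete, here is a minimal C++ sketch. It uses a toy whitespace tokenizer and a hand-built vocabulary purely for illustration; the actual GPT-2 example uses BPE via gpt_tokenize and the vocabulary loaded from the model file.

#include <cstdio>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main() {
    // toy vocabulary: token string -> numeric id
    std::map<std::string, int> token_to_id = {
        {"Once", 0}, {"upon", 1}, {"a", 2}, {"time", 3}, {"<unk>", 4},
    };

    std::string text = "Once upon a time";

    // step 1: tokenization (here simply splitting on whitespace)
    std::vector<std::string> tokens;
    std::istringstream iss(text);
    for (std::string tok; iss >> tok; ) {
        tokens.push_back(tok);
    }

    // step 2: lookup - map each token to its id in the vocabulary
    std::vector<int> ids;
    for (const auto & tok : tokens) {
        auto it = token_to_id.find(tok);
        ids.push_back(it != token_to_id.end() ? it->second : token_to_id["<unk>"]);
    }

    for (int id : ids) {
        printf("%d ", id); // prints: 0 1 2 3
    }
    printf("\n");
    return 0;
}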

gpt2_eval

bool gpt2_eval(
const gpt2_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token);
  • model: the loaded GPT-2 weights and hyperparameters; gpt2_model holds all the ggml tensors (embeddings, per-layer weights, KV cache, etc.) that the forward pass uses.
  • n_threads: the number of threads used by ggml_graph_compute_with_ctx, i.e. the parallelism of the forward pass (the -t/--threads command-line option).
  • n_past: the number of tokens already processed, i.e. the length of context currently held in the KV cache; self-attention uses it to compute the write/read offsets into memory_k/v and to mask out positions beyond the history when applying the causal mask.
  • embd_inp: the batch of token ids to feed to the model in this call (typically the next chunk of the prompt or the token that was just sampled), of type std::vector<gpt_vocab::id>.
  • embd_w: output parameter; the function writes the forward result into it (the logits of the last token, of length n_vocab) for the caller to sample from.
  • mem_per_token: an estimate of the scratch memory needed for a single token's forward pass; pass 0 on the first call and the function fills it in from ggml_used_mem(ctx0)/N, after which the caller can size its buffers accordingly and avoid repeated reallocations.

In short, gpt2_eval takes the model, the input tokens, the thread count, and the current context length, computes the final logits, and reports the memory usage; the outer generation loop then samples the next token from those logits.

ggml_tensor

struct ggml_tensor {
enum ggml_type type;
struct ggml_backend_buffer * buffer;
int64_t ne[GGML_MAX_DIMS]; // number of elements in each dimension
size_t nb[GGML_MAX_DIMS]; // stride in bytes for each dimension
enum ggml_op op; // the operator that produced this tensor
int32_t op_params[...];
int32_t flags;
struct ggml_tensor * src[GGML_MAX_SRC]; // pointers to the source tensors
struct ggml_tensor * view_src;
size_t view_offs;
void * data; // pointer to the actual data
char name[GGML_MAX_NAME];
void * extra;
char padding[8];
};

ggml_op

// available tensor operations:
enum ggml_op {

gpt2-ctx-main

int main(int argc, char ** argv) {
ggml_time_init();

const int64_t t_main_start_us = ggml_time_us();

gpt_params params;
params.model = "models/gpt-2-117M/ggml-model.bin";

if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}

if (params.seed < 0) {
params.seed = time(NULL);
}

printf("%s: seed = %d\n", __func__, params.seed);

std::mt19937 rng(params.seed);
if (params.prompt.empty()) {
params.prompt = gpt_random_prompt(rng);
}

int64_t t_load_us = 0;

gpt_vocab vocab;
gpt2_model model;

// load the model
{
const int64_t t_start_us = ggml_time_us();

if (!gpt2_model_load(params.model, model, vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}

t_load_us = ggml_time_us() - t_start_us;

test_gpt_tokenizer(vocab, params.token_test);
}

int n_past = 0;

int64_t t_sample_us = 0;
int64_t t_predict_us = 0;

std::vector<float> logits;

// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);

params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());

printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
printf("%d ", embd_inp[i]);
}
printf("\n\n");

// submit the input prompt token-by-token
// this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
std::vector<gpt_vocab::id> embd;

// determine the required inference memory per token:
size_t mem_per_token = 0;
gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);

for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();

if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
printf("Failed to predict\n");
return 1;
}

t_predict_us += ggml_time_us() - t_start_us;
}

n_past += embd.size();
embd.clear();

if (i >= embd_inp.size()) {
// sample next token
const int top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;

const int n_vocab = model.hparams.n_vocab;

gpt_vocab::id id = 0;

{
const int64_t t_start_sample_us = ggml_time_us();

id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);

t_sample_us += ggml_time_us() - t_start_sample_us;
}

// add it to the context
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
for (size_t k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
if (int32_t(embd.size()) >= params.n_batch) {
break;
}
}
i += embd.size() - 1;
}

// display text
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
fflush(stdout);

// end of text token
if (embd.back() == 50256) {
break;
}
}

// report timing
{
const int64_t t_main_end_us = ggml_time_us();

printf("\n\n");
printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}

ggml_free(model.ctx_w);

return 0;
}

Forward and backward passes

Forward

  • At the application level (e.g. examples/gpt-2/main-ctx.cpp (lines 392-695)), the forward pass follows ggml's "build the graph, then execute it" pattern: first create the input tensors in a fresh ggml_context and stack operators such as ggml_mul_mat, ggml_norm and ggml_soft_max_inplace to describe the full Transformer computation graph, then run it with ggml_build_forward_expand and ggml_graph_compute_with_ctx. No matrix math is written by hand; every operator is defined in the ggml C core (ggml/src/ggml.c). A minimal sketch of this pattern follows below.
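
A minimal, self-contained sketch of the build-then-execute pattern (my own toy example, not code from the gpt-2 sources; it assumes a ggml revision that provides ggml_new_graph, ggml_graph_compute_with_ctx and the ggml_set_f32/ggml_get_f32_1d helpers, and the 16 MB scratch size is arbitrary):

#include <stdio.h>
#include "ggml.h"

int main() {
    // 1. create a context that owns the tensor metadata and data
    struct ggml_init_params ip = {
        /*mem_size  =*/ 16 * 1024 * 1024,
        /*mem_buffer=*/ NULL,
        /*no_alloc  =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // 2. create input tensors; ne[0] is the innermost dimension,
    //    and ggml_mul_mat requires a->ne[0] == b->ne[0]
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // ne = {4, 3}
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);    // ne = {4}
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);

    // 3. describe the computation; nothing is evaluated yet,
    //    each call only creates a node with op/src filled in
    struct ggml_tensor * y = ggml_soft_max(ctx, ggml_mul_mat(ctx, a, b)); // y->ne = {3}

    // 4. build the graph that ends at y and execute it
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);

    printf("y[0] = %f\n", ggml_get_f32_1d(y, 0)); // softmax over equal logits -> 1/3

    ggml_free(ctx);
    return 0;
}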

Backward

  • ggml implements automatic differentiation in ggml/src/ggml.c (lines 6025-6669): ggml_compute_backward() dispatches on each graph node's op type and accumulates gradients by calling primitive operators such as ggml_add_or_set, ggml_mul and ggml_repeat_back. For example, GGML_OP_MUL_MAT uses the standard "upstream gradient × transposed weights" / "transposed input × upstream gradient" formulas, while GGML_OP_SOFT_MAX, GGML_OP_ROPE and others have dedicated backward routines. The graph traversal, the gradient storage (cgraph->grads) and the gradient accumulation logic all live in this code.
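
As a concrete instance of the mul_mat rule referenced above: for a plain matrix product with upstream gradient, the textbook formulas are

$$
C = A B, \qquad G = \frac{\partial L}{\partial C}
\;\Longrightarrow\;
\frac{\partial L}{\partial A} = G\,B^{\top},
\qquad
\frac{\partial L}{\partial B} = A^{\top} G .
$$

ggml_compute_backward follows this same structure (with the transposes adapted to ggml's mul_mat operand layout) and accumulates the result into cgraph->grads for each source tensor.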

The forward pass, then, is "describe the computation graph with the ggml API at the C++ level, then let the ggml core operators execute it";

the backward pass is handled by ggml_compute_backward, which for each ggml_op produces the gradient tensors according to the corresponding formula and propagates them back through the graph.

ggml_mul_mat

// ggml_mul_mat
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return (t0->ne[0] == t1->ne[0]) &&
(t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
(t1->ne[3]%t0->ne[3] == 0);
}
struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
GGML_ASSERT(ggml_can_mul_mat(a, b));
GGML_ASSERT(!ggml_is_transposed(a));
const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_MUL_MAT;
result->src[0] = a;
result->src[1] = b;
return result;
}
void ggml_mul_mat_set_prec(
struct ggml_tensor * a,
enum ggml_prec prec) {
GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
const int32_t prec_i32 = (int32_t) prec;
ggml_set_op_params_i32(a, 0, prec_i32);
}

ggml_norm

// ggml_norm

static struct ggml_tensor * ggml_norm_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps,
bool inplace) {
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

ggml_set_op_params(result, &eps, sizeof(eps));

result->op = GGML_OP_NORM;
result->src[0] = a;

return result;
}

struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps) {
return ggml_norm_impl(ctx, a, eps, false);
}

struct ggml_tensor * ggml_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps) {
return ggml_norm_impl(ctx, a, eps, true);
}

ggml_softmax

// ggml_soft_max

static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
float scale,
float max_bias,
bool inplace) {
GGML_ASSERT(ggml_is_contiguous(a));

if (mask) {
GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(mask->ne[0] == a->ne[0]);
GGML_ASSERT(mask->ne[1] >= a->ne[1]);
GGML_ASSERT(a->ne[2]%mask->ne[2] == 0);
GGML_ASSERT(a->ne[3]%mask->ne[3] == 0);
}

if (max_bias > 0.0f) {
GGML_ASSERT(mask);
}

struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

float params[] = { scale, max_bias };
ggml_set_op_params(result, params, sizeof(params));

result->op = GGML_OP_SOFT_MAX;
result->src[0] = a;
result->src[1] = mask;

return result;
}

struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
}

struct ggml_tensor * ggml_soft_max_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
}

struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
float scale,
float max_bias) {
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
}

struct ggml_tensor * ggml_soft_max_ext_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
float scale,
float max_bias) {
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, true);
}

void ggml_soft_max_add_sinks(
struct ggml_tensor * a,
struct ggml_tensor * sinks) {
if (!sinks) {
a->src[2] = NULL;
return;
}

GGML_ASSERT(a->op == GGML_OP_SOFT_MAX);
GGML_ASSERT(a->src[2] == NULL);
GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
GGML_ASSERT(sinks->type == GGML_TYPE_F32);

a->src[2] = sinks;
}

This is the standard C idiom for "returning a pointer to a struct": the return type of ggml_soft_max_inplace is struct ggml_tensor *, i.e. a pointer to a ggml_tensor structure. Because the source never typedefs struct ggml_tensor to a shorter alias, declarations have to spell out struct ggml_tensor *; with a typedef such as typedef struct ggml_tensor ggml_tensor; (the style common in C++-leaning code) one could simply write ggml_tensor *. The spelling only looks "odd" because ggml sticks to the most traditional C style for compatibility with plain C compilers. The function body simply forwards the argument a, together with the default parameters, to the internal implementation ggml_soft_max_impl and returns its result.
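
A tiny illustration of the difference (hypothetical point type, not from ggml; the comments describe C's rules, though the snippet also compiles as C++):

struct point { int x, y; };

// without a typedef, C requires the struct keyword in declarations:
struct point * make_point(void);

// after a typedef, the shorter spelling becomes legal:
typedef struct point point;
point * make_point_short(void);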

ggml_rope

// ggml_rope

static struct ggml_tensor * ggml_rope_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[GGML_MROPE_SECTIONS],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
bool inplace) {
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");

GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);

bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
if (mrope_used) {
GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
} else {
GGML_ASSERT(a->ne[2] == b->ne[0]);
}

if (c) {
GGML_ASSERT(c->type == GGML_TYPE_F32);
GGML_ASSERT(c->ne[0] >= n_dims / 2);
}

struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
if (mrope_used && sections) {
memcpy(params + 11, sections, sizeof(int32_t) * GGML_MROPE_SECTIONS);
} else {
memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS);
}
ggml_set_op_params(result, params, sizeof(params));

result->op = GGML_OP_ROPE;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;

return result;
}

struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode) {
return ggml_rope_impl(
ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
);
}

struct ggml_tensor * ggml_rope_multi(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[GGML_MROPE_SECTIONS],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, false
);
}

struct ggml_tensor * ggml_rope_multi_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[GGML_MROPE_SECTIONS],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, true
);
}

struct ggml_tensor * ggml_rope_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode) {
return ggml_rope_impl(
ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
);
}

struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, false
);
}

struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, true
);
}

struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, false
);
}

struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int n_dims,
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
return ggml_rope_impl(
ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow, true
);
}

// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
}

void ggml_rope_yarn_corr_dims(
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
) {
// start and end correction dims
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
dims[0] = MAX(0, start);
dims[1] = MIN(n_dims - 1, end);
}

// ggml_rope_back

struct ggml_tensor * ggml_rope_ext_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
struct ggml_tensor * result = ggml_rope_ext(
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
result->op = GGML_OP_ROPE_BACK;
return result;
}

struct ggml_tensor * ggml_rope_multi_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
struct ggml_tensor * result = ggml_rope_multi(
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
result->op = GGML_OP_ROPE_BACK;
return result;
}