Added support for the "think" option for Ollama #3386

Open · wants to merge 5 commits into base: main
File: OllamaChatModel.java
@@ -249,6 +249,7 @@ private ChatResponse internalCall(Prompt prompt, ChatResponse previousChatResponse
if (ollamaResponse.promptEvalCount() != null && ollamaResponse.evalCount() != null) {
generationMetadata = ChatGenerationMetadata.builder()
.finishReason(ollamaResponse.doneReason())
.metadata("thinking", ollamaResponse.message().thinking())
.build();
}

@@ -460,7 +461,8 @@ else if (message instanceof ToolResponseMessage toolMessage) {
OllamaApi.ChatRequest.Builder requestBuilder = OllamaApi.ChatRequest.builder(requestOptions.getModel())
.stream(stream)
.messages(ollamaMessages)
.options(requestOptions);
.options(requestOptions)
.think(requestOptions.getThink());

if (requestOptions.getFormat() != null) {
requestBuilder.format(requestOptions.getFormat());
File: OllamaApi.java
@@ -51,6 +51,7 @@
* @author Christian Tzolov
* @author Thomas Vitale
* @author Jonghoon Park
* @author Sun Yuhan
* @since 0.8.0
*/
// @formatter:off
@@ -251,6 +252,7 @@ public Flux<ProgressResponse> pullModel(PullModelRequest pullModelRequest) {
*
* @param role The role of the message of type {@link Role}.
* @param content The content of the message.
* @param thinking The model's thinking process, present when thinking is enabled.
* @param images The list of base64-encoded images to send with the message.
* Requires multimodal models such as llava or bakllava.
* @param toolCalls The relevant tool call.
@@ -260,6 +262,7 @@ public Flux<ProgressResponse> pullModel(PullModelRequest pullModelRequest) {
public record Message(
@JsonProperty("role") Role role,
@JsonProperty("content") String content,
@JsonProperty("thinking") String thinking,
@JsonProperty("images") List<String> images,
@JsonProperty("tool_calls") List<ToolCall> toolCalls) {

@@ -321,6 +324,7 @@ public static class Builder {

private final Role role;
private String content;
private String thinking;
private List<String> images;
private List<ToolCall> toolCalls;

@@ -333,6 +337,11 @@ public Builder content(String content) {
return this;
}

public Builder thinking(String thinking) {
this.thinking = thinking;
return this;
}

public Builder images(List<String> images) {
this.images = images;
return this;
@@ -344,7 +353,7 @@ public Builder toolCalls(List<ToolCall> toolCalls) {
}

public Message build() {
return new Message(this.role, this.content, this.images, this.toolCalls);
return new Message(this.role, this.content, this.thinking, this.images, this.toolCalls);
}
}
}
@@ -359,6 +368,7 @@ public Message build() {
* @param keepAlive Controls how long the model will stay loaded into memory following this request (default: 5m).
* @param tools List of tools the model has access to.
* @param options Model-specific options. For example, "temperature" can be set through this field, if the model supports it.
* You can use the {@link OllamaOptions} builder to create the options then {@link OllamaOptions#toMap()} to convert the options into a map.
* @param think Whether the model should think before responding, if the model supports it.
*
* @see <a href=
@@ -375,7 +385,8 @@ public record ChatRequest(
@JsonProperty("format") Object format,
@JsonProperty("keep_alive") String keepAlive,
@JsonProperty("tools") List<Tool> tools,
@JsonProperty("options") Map<String, Object> options
@JsonProperty("options") Map<String, Object> options,
@JsonProperty("think") Boolean think
) {

public static Builder builder(String model) {
@@ -448,6 +459,7 @@ public static class Builder {
private String keepAlive;
private List<Tool> tools = List.of();
private Map<String, Object> options = Map.of();
private Boolean think;

public Builder(String model) {
Assert.notNull(model, "The model can not be null.");
@@ -492,8 +504,13 @@ public Builder options(OllamaOptions options) {
return this;
}

public Builder think(Boolean think) {
this.think = think;
return this;
}

public ChatRequest build() {
return new ChatRequest(this.model, this.messages, this.stream, this.format, this.keepAlive, this.tools, this.options);
return new ChatRequest(this.model, this.messages, this.stream, this.format, this.keepAlive, this.tools, this.options, this.think);
}
}
}
File: OllamaApiHelper.java
@@ -25,6 +25,7 @@

/**
* @author Christian Tzolov
* @author Sun Yuhan
* @since 1.0.0
*/
public final class OllamaApiHelper {
@@ -81,12 +82,18 @@ public static ChatResponse merge(ChatResponse previous, ChatResponse current) {
private static OllamaApi.Message merge(OllamaApi.Message previous, OllamaApi.Message current) {

String content = mergeContent(previous, current);
String thinking = mergeThinking(previous, current);
OllamaApi.Message.Role role = (current.role() != null ? current.role() : previous.role());
role = (role != null ? role : OllamaApi.Message.Role.ASSISTANT);
List<String> images = mergeImages(previous, current);
List<OllamaApi.Message.ToolCall> toolCalls = mergeToolCall(previous, current);

return OllamaApi.Message.builder(role).content(content).images(images).toolCalls(toolCalls).build();
return OllamaApi.Message.builder(role)
.content(content)
.thinking(thinking)
.images(images)
.toolCalls(toolCalls)
.build();
}

private static Instant merge(Instant previous, Instant current) {
@@ -134,6 +141,17 @@ private static String mergeContent(OllamaApi.Message previous, OllamaApi.Message current) {
return previous.content() + current.content();
}

private static String mergeThinking(OllamaApi.Message previous, OllamaApi.Message current) {
if (previous == null || previous.thinking() == null) {
return (current != null ? current.thinking() : null);
}
if (current == null || current.thinking() == null) {
return (previous != null ? previous.thinking() : null);
}

return previous.thinking() + current.thinking();
}

private static List<OllamaApi.Message.ToolCall> mergeToolCall(OllamaApi.Message previous,
OllamaApi.Message current) {
if (previous == null) {
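To illustrate the merge semantics introduced by mergeThinking above, here is a standalone sketch (plain strings rather than the actual OllamaApi.Message type, and not part of this PR) of how incremental thinking fragments from a streamed response concatenate chunk by chunk:

import java.util.List;

public class MergeThinkingSketch {

    // Mirrors the null-safe concatenation of mergeThinking above.
    static String mergeThinking(String previous, String current) {
        if (previous == null) {
            return current;
        }
        if (current == null) {
            return previous;
        }
        return previous + current;
    }

    public static void main(String[] args) {
        // Thinking fragments as they might arrive in a streamed response.
        List<String> chunks = List.of("Okay", ",", " the", " user", " is", " asking");
        String thinking = null;
        for (String chunk : chunks) {
            thinking = mergeThinking(thinking, chunk);
        }
        System.out.println(thinking); // prints: Okay, the user is asking
    }
}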
File: OllamaModel.java
@@ -23,6 +23,7 @@
*
* @author Siarhei Blashuk
* @author Thomas Vitale
* @author Sun Yuhan
* @since 1.0.0
*/
public enum OllamaModel implements ChatModelDescription {
@@ -32,6 +33,21 @@ public enum OllamaModel implements ChatModelDescription {
*/
QWEN_2_5_7B("qwen2.5"),

/**
* Qwen3
*/
QWEN_3_8B("qwen3"),

/**
* Qwen3 1.7b
*/
QWEN_3_1_7_B("qwen3:1.7b"),

/**
* Qwen3 0.6b
*/
QWEN_3_06B("qwen3:0.6b"),

/**
* QwQ is the reasoning model of the Qwen series.
*/
File: OllamaOptions.java
@@ -44,6 +44,7 @@
* @author Christian Tzolov
* @author Thomas Vitale
* @author Ilayaperumal Gopinathan
* @author Sun Yuhan
* @since 0.8.0
* @see <a href=
* "https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values">Ollama
@@ -318,6 +319,14 @@ public class OllamaOptions implements ToolCallingChatOptions, EmbeddingOptions {
@JsonProperty("truncate")
private Boolean truncate;

/**
* Whether the model should think before responding, if supported.
* If this value is not specified, it defaults to null, and Ollama will return
* the thought process within the `content` field of the response, wrapped in `&lt;think&gt;` tags.
*/
@JsonProperty("think")
Member commented:

It seems like 'think' is not a part of the options map in ollama, but a 'top level' field in the request object.
In https://ollama.com/blog/thinking there is the example

curl http://localhost:11434/api/chat -d '{
  "model": "deepseek-r1",
  "messages": [
    {
      "role": "user",
      "content": "how many r in the word strawberry?"
    }
  ],
  "think": true,
  "stream": false
}'

and the golang type supporting this feature also shows the same structure.

https://github.com/ollama/ollama/blob/45f56355d557b7130c7c07bbd6e1b634a758d946/api/types.go#L91

So it shouldn't be added to the options map.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment in the code

"and Ollama will return the thought process within the `content` field of the response, wrapped in `<think>` tags."

seems to contradict what was documented on the ollama web site that shows the 'think' response as a separate field from 'content', and not nested inside the 'content' field.

markpollack (Member) commented on Jun 11, 2025:

Maybe we want to also expose it in OllamaOptions as that right now would be the only way to pass in this feature flag when making calls via ChatModel or ChatClient. I think if the feature of enabling thinking mode is implemented from a ChatModel or ChatClient level the right solution will present itself. Can you improve this PR to handle this scenario please?
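A minimal sketch of what that could look like at the ChatClient level, assuming OllamaOptions gains the think() builder method proposed in this PR (the surrounding fluent ChatClient API is existing Spring AI; chatModel is an OllamaChatModel assumed to be wired elsewhere):

import org.springframework.ai.chat.client.ChatClient;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.ollama.api.OllamaOptions;

// Enable thinking for a single call via per-request options.
ChatResponse response = ChatClient.create(chatModel)
    .prompt()
    .user("why is the sky blue?")
    .options(OllamaOptions.builder().think(true).build())
    .call()
    .chatResponse();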

sunyuhan1998 (Contributor, Author) replied:

Of course, no problem. I will continue improving this PR and update my progress here in a timely manner.

Before proceeding, I want to confirm whether I’ve correctly understood your point: Are you suggesting that we should not add the think flag in OllamaOptions, but instead make adjustments at the OllamaChatModel level? Or do you mean that we should implement support for think at the ChatModel or ChatClient level? If it's the latter, I think we would also need to adjust the implementations of different ChatModels to support this option (of course, depending on whether the underlying model actually supports it).

sunyuhan1998 (Contributor, Author) commented on Jun 12, 2025:

seems to contradict what was documented on the ollama web site that shows the 'think' response as a separate field from 'content', and not nested inside the 'content' field.

Firstly, the content in the comment is actually a summary I derived through practical testing.

I think this is a form of backward compatibility Ollama implemented for users still employing the old parameter-passing method. Before Ollama supported the "think" flag, if we made a request to a model that supports thinking like this:

curl http://localhost:11434/api/chat -d '{
  "model": "qwen3:4b",
  "messages": [
    { "role": "user", "content": "why is the sky blue?" }
  ]
}'

Ollama would enable thinking by default and return the thought process wrapped in <think> tags within the content field of the response. The response would look something like this:

{"model":"qwen3:4b","created_at":"2025-06-12T12:17:08.385341Z","message":{"role":"assistant","content":"\u003cthink\u003e"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:17:08.402756Z","message":{"role":"assistant","content":"\n"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:17:08.420851Z","message":{"role":"assistant","content":"Okay"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:17:08.439825Z","message":{"role":"assistant","content":","},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:17:08.457618Z","message":{"role":"assistant","content":" the"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:17:08.474711Z","message":{"role":"assistant","content":" user"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:17:08.491833Z","message":{"role":"assistant","content":" is"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:17:08.509124Z","message":{"role":"assistant","content":" asking"},"done":false}

This remains the case in the latest version of Ollama: if the think flag is not specified, the behavior of Ollama remains unchanged from before. This is what I intended to express in the comment:

If this value is not specified, it defaults to null, and Ollama will return the thought process within the content field of the response, wrapped in <think> tags.

Only when we specify the think flag will Ollama return the thought process in the thinking field of the response:

curl http://localhost:11434/api/chat -d '{
  "model": "qwen3:4b",
  "messages": [
    { "role": "user", "content": "why is the sky blue?" }
  ],
  "think": true
}'

Response:

{"model":"qwen3:4b","created_at":"2025-06-12T12:22:48.135211Z","message":{"role":"assistant","content":"","thinking":"Okay"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:22:48.152511Z","message":{"role":"assistant","content":"","thinking":","},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:22:48.169911Z","message":{"role":"assistant","content":"","thinking":" the"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:22:48.187023Z","message":{"role":"assistant","content":"","thinking":" user"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:22:48.204039Z","message":{"role":"assistant","content":"","thinking":" is"},"done":false}
{"model":"qwen3:4b","created_at":"2025-06-12T12:22:48.221233Z","message":{"role":"assistant","content":"","thinking":" asking"},"done":false}

I think it's the same for Spring AI. We should maintain compatibility with users who are using older versions, meaning that if the "think" flag is not specified, the returned format should remain unchanged.
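On the consuming side, the metadata("thinking", ...) change in this PR's OllamaChatModel would let callers read the captured thought process from the generation metadata. A hedged sketch, assuming ChatGenerationMetadata's generic get(String) accessor from current Spring AI:

import org.springframework.ai.chat.metadata.ChatGenerationMetadata;

// response: the ChatResponse returned for a call made with think = true
ChatGenerationMetadata metadata = response.getResult().getMetadata();
String thinking = metadata.get("thinking"); // null when thinking was not requested
String answer = response.getResult().getOutput().getText();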

private Boolean think;

@JsonIgnore
private Boolean internalToolExecutionEnabled;

@@ -365,6 +374,7 @@ public static OllamaOptions fromOptions(OllamaOptions fromOptions) {
.format(fromOptions.getFormat())
.keepAlive(fromOptions.getKeepAlive())
.truncate(fromOptions.getTruncate())
.think(fromOptions.getThink())
.useNUMA(fromOptions.getUseNUMA())
.numCtx(fromOptions.getNumCtx())
.numBatch(fromOptions.getNumBatch())
@@ -704,6 +714,14 @@ public void setTruncate(Boolean truncate) {
this.truncate = truncate;
}

public Boolean getThink() {
return this.think;
}

public void setThink(Boolean think) {
this.think = think;
}

@Override
@JsonIgnore
public List<ToolCallback> getToolCallbacks() {
@@ -804,7 +822,8 @@ public boolean equals(Object o) {
&& Objects.equals(this.repeatPenalty, that.repeatPenalty)
&& Objects.equals(this.presencePenalty, that.presencePenalty)
&& Objects.equals(this.frequencyPenalty, that.frequencyPenalty)
&& Objects.equals(this.mirostat, that.mirostat) && Objects.equals(this.mirostatTau, that.mirostatTau)
&& Objects.equals(this.think, that.think) && Objects.equals(this.mirostat, that.mirostat)
&& Objects.equals(this.mirostatTau, that.mirostatTau)
&& Objects.equals(this.mirostatEta, that.mirostatEta)
&& Objects.equals(this.penalizeNewline, that.penalizeNewline) && Objects.equals(this.stop, that.stop)
&& Objects.equals(this.toolCallbacks, that.toolCallbacks)
@@ -814,13 +833,13 @@

@Override
public int hashCode() {
return Objects.hash(this.model, this.format, this.keepAlive, this.truncate, this.useNUMA, this.numCtx,
this.numBatch, this.numGPU, this.mainGPU, this.lowVRAM, this.f16KV, this.logitsAll, this.vocabOnly,
this.useMMap, this.useMLock, this.numThread, this.numKeep, this.seed, this.numPredict, this.topK,
this.topP, this.minP, this.tfsZ, this.typicalP, this.repeatLastN, this.temperature, this.repeatPenalty,
this.presencePenalty, this.frequencyPenalty, this.mirostat, this.mirostatTau, this.mirostatEta,
this.penalizeNewline, this.stop, this.toolCallbacks, this.toolNames, this.internalToolExecutionEnabled,
this.toolContext);
return Objects.hash(this.model, this.format, this.keepAlive, this.truncate, this.think, this.useNUMA,
this.numCtx, this.numBatch, this.numGPU, this.mainGPU, this.lowVRAM, this.f16KV, this.logitsAll,
this.vocabOnly, this.useMMap, this.useMLock, this.numThread, this.numKeep, this.seed, this.numPredict,
this.topK, this.topP, this.minP, this.tfsZ, this.typicalP, this.repeatLastN, this.temperature,
this.repeatPenalty, this.presencePenalty, this.frequencyPenalty, this.mirostat, this.mirostatTau,
this.mirostatEta, this.penalizeNewline, this.stop, this.toolCallbacks, this.toolNames,
this.internalToolExecutionEnabled, this.toolContext);
}

public static class Builder {
Expand Down Expand Up @@ -852,6 +871,11 @@ public Builder truncate(Boolean truncate) {
return this;
}

public Builder think(Boolean think) {
this.options.think = think;
return this;
}

public Builder useNUMA(Boolean useNUMA) {
this.options.useNUMA = useNUMA;
return this;