ollama/server/thinking.go
Devon Rifkin 77f4594e80 WIP thinking API support
- Allows specifying whether thinking mode should be on or not
- Templates get passed a new option so, e.g., qwen3's template can put
  `/think` or `/no_think` in the system prompt depending on the value of
  the setting
- Add parsing for thinking blocks in both streaming/non-streaming mode
- Update the CLI to make use of these changes

TODO:

- [ ] Don't parse thinking blocks when the user doesn't explicitly set
      the option, to maintain backwards compatibility
- [ ] Warning on CLI when using a non-thinking/older version of a model
      (with an old template)
- [ ] Wire up capabilities fully
- [x] Unify parsing for streaming/non-streaming
- [ ] Update templates
- [ ] Update python/js libraries
- [ ] How to handle differences in models wrt defaults and whether or
      not the thinking ability can even be controlled. If not specified
      by the user, should there be a default or should the template be
      able to check if it was explicitly set?
2025-05-07 16:15:46 -07:00

127 lines
3.8 KiB
Go

package server
import (
"strings"
"unicode"
)
// thinkingParseState identifies which phase of thinking-block parsing a
// thinkingParser is currently in.
type thinkingParseState int

const (
	// No opening tag has been recognized yet; only whitespace and/or a
	// partial opening tag may have been buffered so far.
	thinkingParseState_LookingForOpening thinkingParseState = iota
	// The opening tag was consumed; content is thinking until the closing tag.
	thinkingParseState_Thinking
	// Thinking is over (or was skipped); all further content is normal content.
	thinkingParseState_ThinkingDone
)

// String returns a human-readable name for the state, or "Unknown" for any
// value that is not one of the declared states.
func (s thinkingParseState) String() string {
	names := [...]string{
		thinkingParseState_LookingForOpening: "LookingForOpening",
		thinkingParseState_Thinking:          "Thinking",
		thinkingParseState_ThinkingDone:      "ThinkingDone",
	}
	if s < 0 || int(s) >= len(names) {
		return "Unknown"
	}
	return names[s]
}
// thinkingParser incrementally separates thinking content (text between
// openingTag and closingTag) from normal content as chunks are fed to it
// through addContent.
type thinkingParser struct {
// current phase of parsing; the zero value is LookingForOpening
state thinkingParseState
// tag that starts a thinking block
openingTag string
// tag that ends a thinking block
closingTag string
// content received but not yet returned to the caller, because more input
// is needed to decide how to classify it
acc strings.Builder
}
// addContent feeds another chunk of content into the parser. It returns, in
// order, the thinking content and the normal content that can be sent to the
// user right away. Anything that is still ambiguous (for example a partial
// tag) stays buffered inside the parser until a later call resolves it.
func (s *thinkingParser) addContent(content string) (string, string) {
	s.acc.WriteString(content)

	var thoughts, visible strings.Builder
	// A single chunk can carry the parser through several states, so keep
	// stepping until eat reports that nothing more can be decided; callers
	// should never wait for data that is already unambiguous.
	for {
		t, v, more := eat(s)
		thoughts.WriteString(t)
		visible.WriteString(v)
		if !more {
			break
		}
	}
	return thoughts.String(), visible.String()
}
// eat performs a single parsing step on the parser's buffered content, based
// on its current state. It returns the thinking content and the normal
// content that became unambiguous during this step. The additional bool
// return is true iff we should continue eating, i.e. the state changed and
// buffered content may still be waiting to be classified.
//
// It panics if the parser is in an unknown state.
//
// NOTE(review): whitespace directly after a tag is only trimmed when it is
// already buffered at the moment the tag is recognized; whitespace arriving
// in a later chunk is passed through untouched.
func eat(s *thinkingParser) (string, string, bool) {
	switch s.state {
	case thinkingParseState_LookingForOpening:
		buffered := s.acc.String()
		trimmed := strings.TrimLeftFunc(buffered, unicode.IsSpace)
		switch {
		case strings.HasPrefix(trimmed, s.openingTag):
			after := strings.TrimPrefix(trimmed, s.openingTag)
			after = strings.TrimLeftFunc(after, unicode.IsSpace)
			// after might contain more than just thinking tokens, so we continue
			// parsing instead of returning it as thinking tokens here
			s.acc.Reset()
			s.acc.WriteString(after)
			s.state = thinkingParseState_Thinking
			return "", "", true
		case trimmed == "":
			// saw whitespace only, so let's keep accumulating
			return "", "", false
		case strings.HasPrefix(s.openingTag, trimmed):
			// partial opening seen, so let's keep accumulating
			return "", "", false
		default:
			// didn't see an opening tag, but we have content, so thinking was
			// skipped
			s.state = thinkingParseState_ThinkingDone
			// The buffer must be cleared here: its content is handed to the
			// caller now, and leaving it in place would make the ThinkingDone
			// state emit the same text again on the next call.
			s.acc.Reset()
			// note that we use the original content, not the trimmed one because
			// we don't want to eat any whitespace in the real content if there
			// were no thinking tags
			return "", buffered, false
		}
	case thinkingParseState_Thinking:
		acc := s.acc.String()
		if thinking, remaining, found := strings.Cut(acc, s.closingTag); found {
			// everything before the first closing tag is thinking; the rest is
			// normal content with its leading whitespace removed
			remaining = strings.TrimLeftFunc(remaining, unicode.IsSpace)
			s.acc.Reset()
			s.state = thinkingParseState_ThinkingDone
			return thinking, remaining, false
		}
		if n := overlap(acc, s.closingTag); n > 0 {
			thinking := acc[:len(acc)-n]
			candidate := acc[len(acc)-n:]
			// keep track of the candidate closing tag. We have to buffer it until
			// it becomes disambiguated
			s.acc.Reset()
			s.acc.WriteString(candidate)
			return thinking, "", false
		}
		// purely just thinking tokens, so we can return them
		s.acc.Reset()
		return acc, "", false
	case thinkingParseState_ThinkingDone:
		// thinking is over, so everything buffered is normal content
		acc := s.acc.String()
		s.acc.Reset()
		return "", acc, false
	default:
		panic("unknown state")
	}
}
// overlap reports the length of the longest suffix of s that is also a
// prefix of delim, or 0 when no non-empty suffix of s matches.
func overlap(s, delim string) int {
	// Start from the longest candidate that fits in both strings and shrink
	// until a match is found or nothing is left.
	n := min(len(s), len(delim))
	for n > 0 && !strings.HasSuffix(s, delim[:n]) {
		n--
	}
	return n
}