Skip to content

Commit 1d58a06

Browse files
committed
feat(vertexai): Gemini multimodal output
1 parent dd6a8f0 commit 1d58a06

File tree

9 files changed

+244
-2
lines changed

9 files changed

+244
-2
lines changed

common/api-review/vertexai.api.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ export { Date_2 as Date }
124124
export interface EnhancedGenerateContentResponse extends GenerateContentResponse {
125125
// (undocumented)
126126
functionCalls: () => FunctionCall[] | undefined;
127+
inlineDataParts: () => InlineDataPart[] | undefined;
127128
text: () => string;
128129
}
129130

@@ -304,6 +305,8 @@ export interface GenerationConfig {
304305
// (undocumented)
305306
presencePenalty?: number;
306307
responseMimeType?: string;
308+
// @beta
309+
responseModalities?: ResponseModality[];
307310
responseSchema?: TypedSchema | SchemaRequest;
308311
// (undocumented)
309312
stopSequences?: string[];
@@ -596,6 +599,15 @@ export interface RequestOptions {
596599
timeout?: number;
597600
}
598601

602+
// @beta
603+
export const ResponseModality: {
604+
readonly TEXT: "TEXT";
605+
readonly IMAGE: "IMAGE";
606+
};
607+
608+
// @beta
609+
export type ResponseModality = (typeof ResponseModality)[keyof typeof ResponseModality];
610+
599611
// @public (undocumented)
600612
export interface RetrievedContextAttribution {
601613
// (undocumented)

docs-devsite/vertexai.enhancedgeneratecontentresponse.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ export interface EnhancedGenerateContentResponse extends GenerateContentResponse
2424
| Property | Type | Description |
2525
| --- | --- | --- |
2626
| [functionCalls](./vertexai.enhancedgeneratecontentresponse.md#enhancedgeneratecontentresponsefunctioncalls) | () =&gt; [FunctionCall](./vertexai.functioncall.md#functioncall_interface)<!-- -->\[\] \| undefined | |
27+
| [inlineDataParts](./vertexai.enhancedgeneratecontentresponse.md#enhancedgeneratecontentresponseinlinedataparts) | () =&gt; [InlineDataPart](./vertexai.inlinedatapart.md#inlinedatapart_interface)<!-- -->\[\] \| undefined | Aggregates and returns all [InlineDataPart](./vertexai.inlinedatapart.md#inlinedatapart_interface)<!-- -->s from the [GenerateContentResponse](./vertexai.generatecontentresponse.md#generatecontentresponse_interface)<!-- -->'s first candidate. |
2728
| [text](./vertexai.enhancedgeneratecontentresponse.md#enhancedgeneratecontentresponsetext) | () =&gt; string | Returns the text string from the response, if available. Throws if the prompt or candidate was blocked. |
2829
2930
## EnhancedGenerateContentResponse.functionCalls
@@ -34,6 +35,16 @@ export interface EnhancedGenerateContentResponse extends GenerateContentResponse
3435
functionCalls: () => FunctionCall[] | undefined;
3536
```
3637
38+
## EnhancedGenerateContentResponse.inlineDataParts
39+
40+
Aggregates and returns all [InlineDataPart](./vertexai.inlinedatapart.md#inlinedatapart_interface)<!-- -->s from the [GenerateContentResponse](./vertexai.generatecontentresponse.md#generatecontentresponse_interface)<!-- -->'s first candidate.
41+
42+
<b>Signature:</b>
43+
44+
```typescript
45+
inlineDataParts: () => InlineDataPart[] | undefined;
46+
```
47+
3748
## EnhancedGenerateContentResponse.text
3849
3950
Returns the text string from the response, if available. Throws if the prompt or candidate was blocked.

docs-devsite/vertexai.generationconfig.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,12 @@ export interface GenerationConfig
2727
| [maxOutputTokens](./vertexai.generationconfig.md#generationconfigmaxoutputtokens) | number | |
2828
| [presencePenalty](./vertexai.generationconfig.md#generationconfigpresencepenalty) | number | |
2929
| [responseMimeType](./vertexai.generationconfig.md#generationconfigresponsemimetype) | string | Output response MIME type of the generated candidate text. Supported MIME types are <code>text/plain</code> (default, text output), <code>application/json</code> (JSON response in the candidates), and <code>text/x.enum</code>. |
30+
| [responseModalities](./vertexai.generationconfig.md#generationconfigresponsemodalities) | [ResponseModality](./vertexai.md#responsemodality)<!-- -->\[\] | <b><i>(Public Preview)</i></b> Generation modalities to be returned in generation responses. |
3031
| [responseSchema](./vertexai.generationconfig.md#generationconfigresponseschema) | [TypedSchema](./vertexai.md#typedschema) \| [SchemaRequest](./vertexai.schemarequest.md#schemarequest_interface) | Output response schema of the generated candidate text. This value can be a class generated with a [Schema](./vertexai.schema.md#schema_class) static method like <code>Schema.string()</code> or <code>Schema.object()</code> or it can be a plain JS object matching the [SchemaRequest](./vertexai.schemarequest.md#schemarequest_interface) interface. <br/>Note: This only applies when the specified <code>responseMIMEType</code> supports a schema; currently this is limited to <code>application/json</code> and <code>text/x.enum</code>. |
3136
| [stopSequences](./vertexai.generationconfig.md#generationconfigstopsequences) | string\[\] | |
3237
| [temperature](./vertexai.generationconfig.md#generationconfigtemperature) | number | |
3338
| [topK](./vertexai.generationconfig.md#generationconfigtopk) | number | |
@@ -75,6 +80,21 @@ Output response MIME type of the generated candidate text. Supported MIME types
7580
responseMimeType?: string;
7681
```
7782

83+
## GenerationConfig.responseModalities
84+
85+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
86+
>
87+
88+
Generation modalities to be returned in generation responses.
89+
90+
- Multimodal response generation is only supported in `gemini-2.0-flash-exp`<!-- -->, not `gemini-2.0-flash`<!-- -->.
- Only image generation (`ResponseModality.IMAGE`<!-- -->) is supported.
91+
92+
<b>Signature:</b>
93+
94+
```typescript
95+
responseModalities?: ResponseModality[];
96+
```
97+
7898
## GenerationConfig.responseSchema
7999

80100
Output response schema of the generated candidate text. This value can be a class generated with a [Schema](./vertexai.schema.md#schema_class) static method like `Schema.string()` or `Schema.object()` or it can be a plain JS object matching the [SchemaRequest](./vertexai.schemarequest.md#schemarequest_interface) interface. <br/>Note: This only applies when the specified `responseMIMEType` supports a schema; currently this is limited to `application/json` and `text/x.enum`<!-- -->.

docs-devsite/vertexai.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,14 @@ The Vertex AI in Firebase Web SDK.
125125
| Variable | Description |
126126
| --- | --- |
127127
| [POSSIBLE\_ROLES](./vertexai.md#possible_roles) | Possible roles. |
128+
| [ResponseModality](./vertexai.md#responsemodality) | <b><i>(Public Preview)</i></b> Generation modalities to be returned in generation responses. |
128129

129130
## Type Aliases
130131

131132
| Type Alias | Description |
132133
| --- | --- |
133134
| [Part](./vertexai.md#part) | Content part - includes text, image/video, or function call/response part types. |
135+
| [ResponseModality](./vertexai.md#responsemodality) | <b><i>(Public Preview)</i></b> Generation modalities to be returned in generation responses. |
134136
| [Role](./vertexai.md#role) | Role is the producer of the content. |
135137
| [Tool](./vertexai.md#tool) | Defines a tool that model can call to access external knowledge. |
136138
| [TypedSchema](./vertexai.md#typedschema) | A type that includes all specific Schema types. |
@@ -223,6 +225,22 @@ Possible roles.
223225
POSSIBLE_ROLES: readonly ["user", "model", "function", "system"]
224226
```
225227

228+
## ResponseModality
229+
230+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
231+
>
232+
233+
Generation modalities to be returned in generation responses.
234+
235+
<b>Signature:</b>
236+
237+
```typescript
238+
ResponseModality: {
239+
readonly TEXT: "TEXT";
240+
readonly IMAGE: "IMAGE";
241+
}
242+
```
243+
226244
## Part
227245

228246
Content part - includes text, image/video, or function call/response part types.
@@ -233,6 +251,19 @@ Content part - includes text, image/video, or function call/response part types.
233251
export type Part = TextPart | InlineDataPart | FunctionCallPart | FunctionResponsePart | FileDataPart;
234252
```
235253

254+
## ResponseModality
255+
256+
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
257+
>
258+
259+
Generation modalities to be returned in generation responses.
260+
261+
<b>Signature:</b>
262+
263+
```typescript
264+
export type ResponseModality = (typeof ResponseModality)[keyof typeof ResponseModality];
265+
```
266+
236267
## Role
237268

238269
Role is the producer of the content.

packages/vertexai/src/requests/response-helpers.test.ts

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import {
2929
FinishReason,
3030
GenerateContentResponse,
3131
ImagenGCSImage,
32+
InlineDataPart,
3233
ImagenInlineImage
3334
} from '../types';
3435
import { getMockResponse } from '../../test-utils/mock-response';
@@ -132,6 +133,44 @@ const fakeResponseMixed3: GenerateContentResponse = {
132133
]
133134
};
134135

136+
const inlineDataPart1: InlineDataPart = {
137+
inlineData: {
138+
mimeType: 'image/png',
139+
data: 'base64encoded...'
140+
}
141+
};
142+
143+
const inlineDataPart2: InlineDataPart = {
144+
inlineData: {
145+
mimeType: 'image/jpeg',
146+
data: 'anotherbase64...'
147+
}
148+
};
149+
150+
const fakeResponseInlineData: GenerateContentResponse = {
151+
candidates: [
152+
{
153+
index: 0,
154+
content: {
155+
role: 'model',
156+
parts: [inlineDataPart1, inlineDataPart2]
157+
}
158+
}
159+
]
160+
};
161+
162+
const fakeResponseTextAndInlineData: GenerateContentResponse = {
163+
candidates: [
164+
{
165+
index: 0,
166+
content: {
167+
role: 'model',
168+
parts: [{ text: 'Describe this:' }, inlineDataPart1]
169+
}
170+
}
171+
]
172+
};
173+
135174
const badFakeResponse: GenerateContentResponse = {
136175
promptFeedback: {
137176
blockReason: BlockReason.SAFETY,
@@ -148,13 +187,15 @@ describe('response-helpers methods', () => {
148187
const enhancedResponse = addHelpers(fakeResponseText);
149188
expect(enhancedResponse.text()).to.equal('Some text and some more text');
150189
expect(enhancedResponse.functionCalls()).to.be.undefined;
190+
expect(enhancedResponse.inlineDataParts()).to.be.undefined;
151191
});
152192
it('good response functionCall', async () => {
153193
const enhancedResponse = addHelpers(fakeResponseFunctionCall);
154194
expect(enhancedResponse.text()).to.equal('');
155195
expect(enhancedResponse.functionCalls()).to.deep.equal([
156196
functionCallPart1.functionCall
157197
]);
198+
expect(enhancedResponse.inlineDataParts()).to.be.undefined;
158199
});
159200
it('good response functionCalls', async () => {
160201
const enhancedResponse = addHelpers(fakeResponseFunctionCalls);
@@ -163,31 +204,54 @@ describe('response-helpers methods', () => {
163204
functionCallPart1.functionCall,
164205
functionCallPart2.functionCall
165206
]);
207+
expect(enhancedResponse.inlineDataParts()).to.be.undefined;
166208
});
167209
it('good response text/functionCall', async () => {
168210
const enhancedResponse = addHelpers(fakeResponseMixed1);
169211
expect(enhancedResponse.functionCalls()).to.deep.equal([
170212
functionCallPart2.functionCall
171213
]);
172214
expect(enhancedResponse.text()).to.equal('some text');
215+
expect(enhancedResponse.inlineDataParts()).to.be.undefined;
173216
});
174217
it('good response functionCall/text', async () => {
175218
const enhancedResponse = addHelpers(fakeResponseMixed2);
176219
expect(enhancedResponse.functionCalls()).to.deep.equal([
177220
functionCallPart1.functionCall
178221
]);
179222
expect(enhancedResponse.text()).to.equal('some text');
223+
expect(enhancedResponse.inlineDataParts()).to.be.undefined;
180224
});
181225
it('good response text/functionCall/text', async () => {
182226
const enhancedResponse = addHelpers(fakeResponseMixed3);
183227
expect(enhancedResponse.functionCalls()).to.deep.equal([
184228
functionCallPart1.functionCall
185229
]);
186230
expect(enhancedResponse.text()).to.equal('some text and more text');
231+
expect(enhancedResponse.inlineDataParts()).to.be.undefined;
187232
});
188233
it('bad response safety', async () => {
189234
const enhancedResponse = addHelpers(badFakeResponse);
190235
expect(enhancedResponse.text).to.throw('SAFETY');
236+
expect(enhancedResponse.functionCalls).to.throw('SAFETY');
237+
expect(enhancedResponse.inlineDataParts).to.throw('SAFETY');
238+
});
239+
it('good response inlineData', async () => {
240+
const enhancedResponse = addHelpers(fakeResponseInlineData);
241+
expect(enhancedResponse.text()).to.equal('');
242+
expect(enhancedResponse.functionCalls()).to.be.undefined;
243+
expect(enhancedResponse.inlineDataParts()).to.deep.equal([
244+
inlineDataPart1,
245+
inlineDataPart2
246+
]);
247+
});
248+
it('good response text/inlineData', async () => {
249+
const enhancedResponse = addHelpers(fakeResponseTextAndInlineData);
250+
expect(enhancedResponse.text()).to.equal('Describe this:');
251+
expect(enhancedResponse.functionCalls()).to.be.undefined;
252+
expect(enhancedResponse.inlineDataParts()).to.deep.equal([
253+
inlineDataPart1
254+
]);
191255
});
192256
});
193257
describe('getBlockString', () => {

packages/vertexai/src/requests/response-helpers.ts

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import {
2323
GenerateContentResponse,
2424
ImagenGCSImage,
2525
ImagenInlineImage,
26+
InlineDataPart,
2627
VertexAIErrorCode
2728
} from '../types';
2829
import { VertexAIError } from '../errors';
@@ -89,6 +90,38 @@ export function addHelpers(
8990
}
9091
return '';
9192
};
93+
(response as EnhancedGenerateContentResponse).inlineDataParts = (): InlineDataPart[] | undefined => {
94+
if (response.candidates && response.candidates.length > 0) {
95+
if (response.candidates.length > 1) {
96+
logger.warn(
97+
`This response had ${response.candidates.length} ` +
98+
`candidates. Returning data from the first candidate only. ` +
99+
`Access response.candidates directly to use the other candidates.`
100+
);
101+
}
102+
if (hadBadFinishReason(response.candidates[0])) {
103+
throw new VertexAIError(
104+
VertexAIErrorCode.RESPONSE_ERROR,
105+
`Response error: ${formatBlockErrorMessage(
106+
response
107+
)}. Response body stored in error.response`,
108+
{
109+
response
110+
}
111+
);
112+
}
113+
return getInlineDataParts(response);
114+
} else if (response.promptFeedback) {
115+
throw new VertexAIError(
116+
VertexAIErrorCode.RESPONSE_ERROR,
117+
`Data not available. ${formatBlockErrorMessage(response)}`,
118+
{
119+
response
120+
}
121+
);
122+
}
123+
return undefined;
124+
};
92125
(response as EnhancedGenerateContentResponse).functionCalls = () => {
93126
if (response.candidates && response.candidates.length > 0) {
94127
if (response.candidates.length > 1) {
@@ -164,6 +197,31 @@ export function getFunctionCalls(
164197
}
165198
}
166199

200+
/**
 * Collects every {@link InlineDataPart} found in the first candidate of a
 * response.
 *
 * Only `candidates[0]` is inspected; any further candidates are ignored.
 *
 * @param response - The response whose first candidate is scanned.
 * @returns The parts that carry `inlineData`, in their original order, or
 * `undefined` when there are none. This includes a response with no
 * candidates, an empty `candidates` array, or a first candidate without
 * content parts.
 *
 * @internal
 */
export function getInlineDataParts(
  response: GenerateContentResponse
): InlineDataPart[] | undefined {
  // The `?.` after the index matters: with an empty `candidates` array,
  // `candidates[0]` is `undefined` and a plain `.content` would throw.
  const parts = response.candidates?.[0]?.content?.parts;
  if (!parts) {
    return undefined;
  }

  const data: InlineDataPart[] = [];
  for (const part of parts) {
    if (part.inlineData) {
      data.push(part);
    }
  }

  // Callers distinguish "no inline data" via `undefined`, not an empty array.
  return data.length > 0 ? data : undefined;
}
224+
167225
const badFinishReasons = [FinishReason.RECITATION, FinishReason.SAFETY];
168226

169227
function hadBadFinishReason(candidate: GenerateContentCandidate): boolean {

packages/vertexai/src/types/enums.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,3 +240,29 @@ export enum Modality {
240240
*/
241241
DOCUMENT = 'DOCUMENT'
242242
}
243+
244+
/**
245+
* Generation modalities to be returned in generation responses.
*
* Each value is a string literal identical to its key. The type alias of the
* same name is the union of these values.
246+
*
247+
* @beta
248+
*/
249+
export const ResponseModality = {
250+
/**
251+
* Text modality, sent as the literal `'TEXT'`.
252+
* @beta
253+
*/
254+
TEXT: 'TEXT',
255+
/**
256+
* Image modality, sent as the literal `'IMAGE'`.
257+
* @beta
258+
*/
259+
IMAGE: 'IMAGE'
260+
} as const;
261+
262+
/**
263+
* Generation modalities to be returned in generation responses.
*
* Union of the value types of the `ResponseModality` constant, i.e.
* `'TEXT' | 'IMAGE'`.
264+
*
265+
* @beta
266+
*/
267+
export type ResponseModality =
268+
(typeof ResponseModality)[keyof typeof ResponseModality];

0 commit comments

Comments
 (0)