Skip to content

Commit ddc8ac2

Browse files
authored
Openai demo trigger conv (#1288)
* wip: trigger conversation with dc call in openai demo. * Now triggering the openai demo to automatically start talking.
1 parent 1d7339b commit ddc8ac2

File tree

5 files changed

+141
-23
lines changed

5 files changed

+141
-23
lines changed

examples/WebRTCExamples/WebRTCOpenAI/Program.cs

Lines changed: 79 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
// History:
3939
// 19 Dec 2024 Aaron Clauson Created, Dublin, Ireland.
4040
// 28 Dec 2024 Aaron Clauson Switched to functional approach for The Craic.
41+
// 17 Jan 2025 Aaron Clauson Added create response data channel message to trigger conversation start.
4142
//
4243
// License:
4344
// BSD 3-Clause "New" or "Revised" License, see included LICENSE.md file.
@@ -67,17 +68,30 @@ record Problem(string detail);
6768

6869
/// <summary>
/// State carried between the connection set up steps in Main.
/// </summary>
/// <param name="Pc">The local peer connection created for the call.</param>
/// <param name="PcConnectedSemaphore">Semaphore that is released by the data channel's onopen handler and awaited
/// before the response create message is sent.</param>
/// <param name="EphemeralKey">The ephemeral key; passed as the token when requesting the SDP answer.</param>
/// <param name="OfferSdp">The SDP text of the local offer.</param>
/// <param name="AnswerSdp">The SDP answer for the offer; an empty string until it has been set.</param>
record PcContext(
6970
RTCPeerConnection Pc,
71+
SemaphoreSlim PcConnectedSemaphore,
7072
string EphemeralKey = "",
7173
string OfferSdp = "",
7274
string AnswerSdp = ""
7375
);
7476

77+
/// <summary>
/// The voice names that can be selected for the OpenAI session.
/// </summary>
/// <remarks>
/// Callers in this file convert a member to text with ToString(), so the member names themselves are the
/// strings that get sent; that is presumably why they are lower case.
/// </remarks>
enum VoicesEnum
78+
{
79+
alloy,
80+
ash,
81+
ballad,
82+
coral,
83+
echo,
84+
sage,
85+
shimmer,
86+
verse
87+
}
88+
7589
class Program
7690
{
7791
private const string OPENAI_REALTIME_SESSIONS_URL = "https://api.openai.com/v1/realtime/sessions";
7892
private const string OPENAI_REALTIME_BASE_URL = "https://api.openai.com/v1/realtime";
7993
private const string OPENAI_MODEL = "gpt-4o-realtime-preview-2024-12-17";
80-
private const string OPENAI_VERSE = "shimmer"; // Supported values are: 'alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', and 'verse'.
94+
private const VoicesEnum OPENAI_VERSE = VoicesEnum.shimmer;
8195
private const string OPENAI_DATACHANNEL_NAME = "oai-events";
8296

8397
private static Microsoft.Extensions.Logging.ILogger logger = NullLogger.Instance;
@@ -101,26 +115,27 @@ static async Task Main(string[] args)
101115
.BindAsync(_ =>
102116
{
103117
logger.LogInformation("STEP 1: Get ephemeral key from OpenAI.");
104-
return CreateEphemeralKeyAsync(OPENAI_REALTIME_SESSIONS_URL, args[0], OPENAI_MODEL, OPENAI_VERSE);
118+
return CreateEphemeralKeyAsync(OPENAI_REALTIME_SESSIONS_URL, args[0], OPENAI_MODEL, OPENAI_VERSE.ToString());
105119
})
106120
.BindAsync(async ephemeralKey =>
107121
{
108122
logger.LogDebug("STEP 2: Create WebRTC PeerConnection & get local SDP offer.");
109123

110-
var pc = await CreatePeerConnection();
124+
var onConnectedSemaphore = new SemaphoreSlim(0, 1);
125+
var pc = await CreatePeerConnection(onConnectedSemaphore);
111126
var offer = pc.createOffer();
112127
await pc.setLocalDescription(offer);
113128

114129
logger.LogDebug("SDP offer:");
115130
logger.LogDebug(offer.sdp);
116131

117132
return Prelude.Right<Problem, PcContext>(
118-
new PcContext(pc, ephemeralKey, offer.sdp, string.Empty)
133+
new PcContext(pc, onConnectedSemaphore, ephemeralKey, offer.sdp, string.Empty)
119134
);
120135
})
121136
.BindAsync(async ctx =>
122137
{
123-
logger.LogInformation("STEP 3: Send SDP offer to OpenAI REST server & get SDP answer.");
138+
logger.LogInformation("STEP 3: Send SDP offer to OpenAI REST server & get SDP answer.");
124139

125140
var answerEither = await GetOpenAIAnswerSdpAsync(ctx.EphemeralKey, ctx.OfferSdp);
126141
return answerEither.Map(answer => ctx with { AnswerSdp = answer });
@@ -141,9 +156,21 @@ static async Task Main(string[] args)
141156
Prelude.Right<Problem, PcContext>(ctx) :
142157
Prelude.Left<Problem, PcContext>(new Problem("Failed to set remote SDP."));
143158
})
159+
.MapAsync(async ctx =>
160+
{
161+
logger.LogInformation("STEP 5: Wait for data channel to connect and then trigger conversation.");
162+
163+
await ctx.PcConnectedSemaphore.WaitAsync();
164+
165+
// NOTE: If you want to trigger the conversation by using the audio from your microphone comment
166+
// out this line.
167+
SendResponseCreate(ctx.Pc.DataChannels.First(), VoicesEnum.alloy, "Introduce urself.");
168+
169+
return ctx;
170+
})
144171
.BindAsync(ctx =>
145172
{
146-
logger.LogInformation("STEP 5: Wait for ctrl-c to indicate user exit.");
173+
logger.LogInformation("STEP 6: Wait for ctrl-c to indicate user exit.");
147174

148175
ManualResetEvent exitMre = new(false);
149176
Console.CancelKeyPress += (_, e) =>
@@ -162,9 +189,35 @@ static async Task Main(string[] args)
162189
Left: prob => Console.WriteLine($"There was a problem setting up the connection. {prob.detail}"),
163190
Right: _ => Console.WriteLine("The call was successful.")
164191
);
165-
}
192+
}
193+
194+
/// <summary>
195+
/// Sends a response create message to the OpenAI data channel to trigger the conversation.
196+
/// </summary>
/// <param name="dc">The data channel the JSON message is sent on.</param>
/// <param name="voice">The voice to request; its ToString() value is placed in the message's voice field.</param>
/// <param name="message">The text placed in the message's instructions field.</param>
197+
private static void SendResponseCreate(RTCDataChannel dc, VoicesEnum voice, string message)
198+
{
199+
var responseCreate = new OpenAIResponseCreate
200+
{
201+
// A fresh GUID is generated as the event ID for every message.
EventID = Guid.NewGuid().ToString(),
202+
Response = new OpenAIResponseCreateResponse
203+
{
204+
Instructions = message,
205+
Voice = voice.ToString()
206+
}
207+
};
208+
209+
logger.LogInformation($"Sending initial response create to first call data channel {dc.label}.");
210+
// NOTE(review): ToJson() is called twice, once for the debug log and once for the send below.
logger.LogDebug(responseCreate.ToJson());
211+
212+
dc.send(responseCreate.ToJson());
213+
}
166214

167-
private static async Task<RTCPeerConnection> CreatePeerConnection()
215+
/// <summary>
216+
/// Method to create the local peer connection instance and data channel.
217+
/// </summary>
218+
/// <param name="onConnectedSemaphore">A semaphore that will get set when the data channel on the peer connection is opened. Since the data channel
219+
/// can only be opened once the peer connection is open this indicates both are ready for use.</param>
220+
private static async Task<RTCPeerConnection> CreatePeerConnection(SemaphoreSlim onConnectedSemaphore)
168221
{
169222
var pcConfig = new RTCConfiguration
170223
{
@@ -224,6 +277,7 @@ private static async Task<RTCPeerConnection> CreatePeerConnection()
224277
dataChannel.onopen += () =>
225278
{
226279
logger.LogDebug("OpenAI data channel opened.");
280+
onConnectedSemaphore.Release();
227281
};
228282

229283
dataChannel.onclose += () => logger.LogDebug($"OpenAI data channel {dataChannel.label} closed.");
@@ -233,6 +287,9 @@ private static async Task<RTCPeerConnection> CreatePeerConnection()
233287
return peerConnection;
234288
}
235289

290+
/// <summary>
291+
/// Event handler for WebRTC data channel messages.
292+
/// </summary>
236293
private static void OnDataChannelMessage(RTCDataChannel dc, DataChannelPayloadProtocols protocol, byte[] data)
237294
{
238295
//logger.LogInformation($"Data channel {dc.label}, protocol {protocol} message length {data.Length}.");
@@ -265,6 +322,10 @@ private static void OnDataChannelMessage(RTCDataChannel dc, DataChannelPayloadPr
265322
}
266323
}
267324

325+
/// <summary>
326+
/// Completes the steps required to get an ephemeral key from the OpenAI REST server. The ephemeral key is needed
327+
/// to send an SDP offer, and get the SDP answer.
328+
/// </summary>
268329
private static async Task<Either<Problem, string>> CreateEphemeralKeyAsync(string sessionsUrl, string openAIToken, string model, string voice)
269330
=> (await SendHttpPostAsync(
270331
sessionsUrl,
@@ -284,13 +345,23 @@ private static async Task<Either<Problem, string>> CreateEphemeralKeyAsync(strin
284345
Prelude.Left<Problem, string>(new Problem("Failed to get ephemeral secret."))
285346
);
286347

348+
/// <summary>
349+
/// Attempts to get the SDP answer from the OpenAI REST server. This is the way OpenAI does the signalling. The
350+
/// ICE candidates will be returned in the SDP answer and are publicly accessible IP addresses.
351+
/// </summary>
352+
/// <remarks>
353+
/// See https://platform.openai.com/docs/guides/realtime-webrtc#creating-an-ephemeral-token.
354+
/// </remarks>
287355
private static Task<Either<Problem, string>> GetOpenAIAnswerSdpAsync(string ephemeralKey, string offerSdp)
288356
=> SendHttpPostAsync(
289357
$"{OPENAI_REALTIME_BASE_URL}?model={OPENAI_MODEL}",
290358
ephemeralKey,
291359
offerSdp,
292360
"application/sdp");
293361

362+
/// <summary>
363+
/// Helper method to send an HTTP POST request with the required headers.
364+
/// </summary>
294365
private static async Task<Either<Problem, string>> SendHttpPostAsync(
295366
string url,
296367
string token,
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// https://platform.openai.com/docs/api-reference/realtime-client-events/response/create
2+
3+
using System.Text.Json;
4+
using System.Text.Json.Serialization;
5+
6+
namespace demo;
7+
8+
/// <summary>
/// The "response.create" event message. Derives from OpenAIServerEventBase and overrides its Type and ToJson members.
/// </summary>
public class OpenAIResponseCreate : OpenAIServerEventBase
9+
{
10+
/// <summary>
/// The event type; always "response.create" for this message.
/// </summary>
[JsonPropertyName("type")]
11+
public override string Type => "response.create";
12+
13+
/// <summary>
/// The response settings, serialised under the "response" property. Must be set when the object is initialised.
/// </summary>
[JsonPropertyName("response")]
14+
public required OpenAIResponseCreateResponse Response { get; set; }
15+
16+
/// <summary>
/// Serialises this instance to JSON using JsonOptions.Default.
/// </summary>
/// <returns>The JSON text for this message.</returns>
public override string ToJson()
17+
{
18+
return JsonSerializer.Serialize(this, JsonOptions.Default);
19+
}
20+
}
21+
22+
/// <summary>
/// The settings carried in the "response" property of an OpenAIResponseCreate message.
/// </summary>
public class OpenAIResponseCreateResponse
23+
{
24+
/// <summary>
/// Serialised as "modalities"; defaults to "audio" and "text".
/// </summary>
[JsonPropertyName("modalities")]
25+
public string[] Modalities { get; set; } = new[] { "audio", "text" };
26+
27+
/// <summary>
/// Serialised as "instructions"; optional.
/// </summary>
[JsonPropertyName("instructions")]
28+
public string? Instructions { get; set; }
29+
30+
/// <summary>
/// Serialised as "voice"; optional.
/// </summary>
[JsonPropertyName("voice")]
31+
public string? Voice { get; set; }
32+
33+
/// <summary>
/// Serialised as "output_audio_format"; defaults to "pcm16".
/// </summary>
/// <remarks>
/// NOTE(review): the property name looks misspelt ("Frmat"). The JSON name comes from the attribute, so the
/// serialised output is unaffected, but a rename would need any callers to be updated.
/// </remarks>
[JsonPropertyName("output_audio_format")]
34+
public string OutputAudioFrmat { get; set; } = "pcm16";
35+
}

examples/WebRTCExamples/WebRTCOpenAI/RealtimeModels/OpenAIServerEventBase.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ public class OpenAIServerEventBase
88
[JsonPropertyName("event_id")]
99
public string? EventID { get; set; }
1010

11-
public string? Type { get; set; }
11+
[JsonPropertyName("type")]
12+
public virtual string? Type { get; set; }
1213

1314
public virtual string ToJson()
1415
{

examples/WebRTCExamples/WebRTCOpenAI/WebRTCOpenAI.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<PropertyGroup>
44
<OutputType>Exe</OutputType>
55
<TargetFramework>net8.0-windows10.0.17763.0</TargetFramework>
6-
<LangVersion>10.0</LangVersion>
6+
<LangVersion>12.0</LangVersion>
77
<Nullable>enable</Nullable>
88
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
99
</PropertyGroup>

examples/WebRTCExamples/WebRTCOpenAIAliceAndBob/Program.cs

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -192,20 +192,10 @@ from bobCtx in bobCallTask.Result
192192
// Trigger the conversation by getting Alice to say something witty.
193193
var aliceDataChannel = aliceConncectedCtx.Pc.DataChannels.Where(x => x.label == OPENAI_DATACHANNEL_NAME).Single();
194194

195-
var responseCreate = new OpenAIResponseCreate
195+
if (aliceDataChannel != null)
196196
{
197-
EventID = Guid.NewGuid().ToString(),
198-
Response = new OpenAIResponseCreateResponse
199-
{
200-
Instructions = "Only talk in cheesy puns. Keep it short once you'vegot you pun in. To start the conversation please repeat repeat this phrase in your corniest accent: 'You're a few tinnies short of a six-pack.'",
201-
Voice = VoicesEnum.shimmer.ToString()
202-
}
203-
};
204-
205-
logger.LogInformation($"Sending initial response create to first call data channel {aliceDataChannel?.label}.");
206-
logger.LogDebug(responseCreate.ToJson());
207-
208-
aliceDataChannel?.send(responseCreate.ToJson());
197+
SendResponseCreate(aliceDataChannel, VoicesEnum.shimmer, "Only talk in cheesy puns. Keep it short once you'vegot you pun in. To start the conversation please repeat repeat this phrase in your corniest accent: 'You're a few tinnies short of a six-pack.'");
198+
}
209199

210200
logger.LogInformation($"ctrl-c to exit..");
211201

@@ -229,6 +219,27 @@ from bobCtx in bobCallTask.Result
229219
_audioScopeForm?.Invoke(() => _audioScopeForm.Close());
230220
}
231221

222+
/// <summary>
223+
/// Sends a response create message to the OpenAI data channel to trigger the conversation.
224+
/// </summary>
/// <param name="dc">The data channel the JSON message is sent on.</param>
/// <param name="voice">The voice to request; its ToString() value is placed in the message's voice field.</param>
/// <param name="message">The text placed in the message's instructions field.</param>
225+
private static void SendResponseCreate(RTCDataChannel dc, VoicesEnum voice, string message)
226+
{
227+
var responseCreate = new OpenAIResponseCreate
228+
{
229+
// A fresh GUID is generated as the event ID for every message.
EventID = Guid.NewGuid().ToString(),
230+
Response = new OpenAIResponseCreateResponse
231+
{
232+
Instructions = message,
233+
Voice = voice.ToString()
234+
}
235+
};
236+
237+
logger.LogInformation($"Sending initial response create to first call data channel {dc.label}.");
238+
// NOTE(review): ToJson() is called twice, once for the debug log and once for the send below.
logger.LogDebug(responseCreate.ToJson());
239+
240+
dc.send(responseCreate.ToJson());
241+
}
242+
232243
/// <summary>
233244
/// Initiates the creation and media session wiring for a local peer connection.
234245
/// </summary>

0 commit comments

Comments
 (0)