Skip to content

Commit ed1b316

Browse files
committed
Add voice conversion method
1 parent cf7cd42 commit ed1b316

File tree

4 files changed

+81
-6
lines changed

4 files changed

+81
-6
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ repos:
44
rev: v0.4.4
55
hooks:
66
- id: ruff
7+
args: ['--preview']
78

89
- repo: https://github.com/pre-commit/pre-commit-hooks
910
rev: v4.4.0

demo/convert.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
import argparse
2+
import asyncio
3+
from lmnt.api import Speech
4+
5+
6+
async def main(args):
    """
    Converts the audio file named by `args.audio` into the voice `args.voice`.

    The converted audio is written to `output.mp3` in the current working directory,
    and `Done.` is printed once the file has been written.
    """
    async with Speech() as client:
        # Load the whole source recording into memory; `convert` takes raw bytes.
        with open(args.audio, 'rb') as source:
            original_audio = source.read()

        result = await client.convert(audio=original_audio, voice=args.voice)
        with open('output.mp3', 'wb') as sink:
            sink.write(result)
        print('Done.')
15+
16+
17+
if __name__ == '__main__':
    # Command-line entry point: both the input file and the target voice are mandatory.
    cli = argparse.ArgumentParser(description='Convert speech to a different voice')
    cli.add_argument('-a', '--audio', required=True, help='Filename of audio to convert')
    cli.add_argument('-v', '--voice', required=True, help='Voice to use')
    asyncio.run(main(cli.parse_args()))

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,9 @@ build-backend = "setuptools.build_meta"
77
[tool.ruff]
88
cache-dir = "~/.cache/ruff"
99
line-length = 160
10+
preview = true
11+
12+
[tool.ruff.lint]
1013
select = [
1114
"E4", "E7", "E9",
1215
"F",
@@ -16,12 +19,12 @@ select = [
1619
]
1720
ignore = ["I001"]
1821

19-
[tool.ruff.flake8-quotes]
22+
[tool.ruff.lint.flake8-quotes]
2023
docstring-quotes = "double"
2124
inline-quotes = "single"
2225
multiline-quotes = "double"
2326

24-
[tool.ruff.per-file-ignores]
27+
[tool.ruff.lint.per-file-ignores]
2528
# F401 = unused import; this warning doesn't make sense in __init__.py files
2629
"__init__.py" = ["F401"]
2730

src/lmnt/api.py

Lines changed: 53 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
_VOICE_ENDPOINT = '/v1/ai/voice/{id}'
2929
_CREATE_VOICE_ENDPOINT = '/v1/ai/voice'
3030
_SPEECH_ENDPOINT = '/v1/ai/speech'
31+
_CONVERT_ENDPOINT = '/v1/ai/speech/convert'
3132
_ACCOUNT_ENDPOINT = '/v1/account'
3233

3334

@@ -158,6 +159,7 @@ async def list_voices(self, starred: bool = False, owner: str = 'all'):
158159
if not isinstance(starred, bool):
159160
raise ValueError(f'Invalid starred: {starred}')
160161
self._lazy_init()
162+
assert self._session is not None, 'Session was not initialized'
161163
url = f'{self._base_url}{_LIST_VOICES_ENDPOINT}?starred={starred}&owner={owner}'
162164

163165
async with self._session.get(url, headers=self._build_headers()) as resp:
@@ -174,13 +176,14 @@ async def voice_info(self, voice_id: str):
174176
Returns a dictionary containing details of the voice.
175177
"""
176178
self._lazy_init()
179+
assert self._session is not None, 'Session was not initialized'
177180
url = f'{self._base_url}{_VOICE_ENDPOINT}'.format(id=voice_id)
178181

179182
async with self._session.get(url, headers=self._build_headers()) as resp:
180183
await self._handle_response_errors(resp, 'Speech.voice_info')
181184
return await resp.json()
182185

183-
async def create_voice(self, name: str, enhance: bool, filenames: List[str], type: str = 'instant', gender: str = None, description: str = None):
186+
async def create_voice(self, name: str, enhance: bool, filenames: List[str], type: str = 'instant', gender: Optional[str] = None, description: Optional[str] = None):
184187
"""
185188
Creates a new voice from a set of audio files. Returns the voice metadata object.
186189
@@ -217,6 +220,7 @@ async def create_voice(self, name: str, enhance: bool, filenames: List[str], typ
217220
raise ValueError('[Speech.create_voice] Enhance must not be None.')
218221

219222
self._lazy_init()
223+
assert self._session is not None, 'Session was not initialized'
220224

221225
metadata = json.dumps({
222226
'name': name,
@@ -256,6 +260,7 @@ async def update_voice(self, voice_id: str, **kwargs):
256260
- `description` (str): A description of the voice.
257261
"""
258262
self._lazy_init()
263+
assert self._session is not None, 'Session was not initialized'
259264
url = f'{self._base_url}{_VOICE_ENDPOINT}'.format(id=voice_id)
260265

261266
data = {
@@ -298,6 +303,7 @@ async def delete_voice(self, voice_id: str):
298303
- `voice_id` (str): The id of the voice to delete. If you don't know the id, you can get it from `list_voices()`.
299304
"""
300305
self._lazy_init()
306+
assert self._session is not None, 'Session was not initialized'
301307
url = f'{self._base_url}{_VOICE_ENDPOINT}'.format(id=voice_id)
302308

303309
async with self._session.delete(url, headers=self._build_headers()) as resp:
@@ -343,6 +349,7 @@ async def synthesize(self, text: str, voice: str, **kwargs):
343349
assert len(voice) > 0, '[Speech.synthesize] `voice` must be non-empty.'
344350

345351
self._lazy_init()
352+
assert self._session is not None, 'Session was not initialized'
346353
url = f'{self._base_url}{_SPEECH_ENDPOINT}'
347354

348355
model = kwargs.get('model', 'aurora')
@@ -384,13 +391,53 @@ async def synthesize(self, text: str, voice: str, **kwargs):
384391
synthesis_result['seed'] = response_data['seed']
385392
return synthesis_result
386393

394+
async def convert(self, audio: bytes, voice: str, **kwargs) -> bytes:
    """
    Converts speech from one voice to another.

    Required parameters:
    - `audio` (bytes): The audio file to be converted into a new voice. Max file size: 1 MB.
    - `voice` (str): The voice id to convert the speech into. Voice ids can be retrieved from `list_voices()` or `voice_info()`.

    Optional parameters:
    - `format` (str): The audio format to use for conversion. Defaults to `mp3`.
    - `sample_rate` (int): 8000, 16000, or 24000 - the desired output sample rate. Defaults to 24000.
    - `language` (str): The language of the source audio. Two letter ISO 639-1 code. Defaults to `en`.

    Returns:
    - bytes: The binary audio data of the converted speech.

    Raises:
    - AssertionError: If `audio` or `voice` is None or empty.
    - Whatever `_handle_response_errors` raises when the server response indicates a failure.
    """
    assert audio is not None, '[Speech.convert] `audio` must not be None.'
    assert len(audio) > 0, '[Speech.convert] `audio` must be non-empty.'
    assert voice is not None, '[Speech.convert] `voice` must not be None.'
    assert len(voice) > 0, '[Speech.convert] `voice` must be non-empty.'

    self._lazy_init()
    assert self._session is not None, 'Session was not initialized'
    url = f'{self._base_url}{_CONVERT_ENDPOINT}'

    form_data = aiohttp.FormData()
    form_data.add_field('audio', audio)
    form_data.add_field('voice', voice)

    # Optional fields are only sent when the caller supplied them, so the
    # server-side defaults apply otherwise.
    if 'format' in kwargs:
        form_data.add_field('format', kwargs['format'])
    if 'sample_rate' in kwargs:
        # The documented type is int, but aiohttp cannot serialize an int as a
        # multipart field (it raises TypeError when building the request), so
        # send its string form instead.
        form_data.add_field('sample_rate', str(kwargs['sample_rate']))
    if 'language' in kwargs:
        form_data.add_field('language', kwargs['language'])

    async with self._session.post(url, data=form_data, headers=self._build_headers()) as resp:
        await self._handle_response_errors(resp, 'Speech.convert')
        return await resp.read()
433+
387434
async def synthesize_streaming(self, voice: str, return_extras: bool = False, **kwargs):
388435
"""
389436
Initiates a full-duplex streaming connection with the server that allows you to send text and receive audio in real-time.
390437
391438
Parameters:
392-
- `format` (str): `mp3`, `raw`, or `ulaw` the desired output format. Defaults to `mp3`.
393-
- `sample_rate` (int): 8000, 16000, or 24000 – the desired output sample rate. Defaults to 24000.
439+
- `format` (str): `mp3`, `raw`, or `ulaw` - the desired output format. Defaults to `mp3`.
440+
- `sample_rate` (int): 8000, 16000, or 24000 - the desired output sample rate. Defaults to 24000.
394441
- `voice` (str): The voice id to use for this connection.
395442
- `speed` (float): The speed to use for synthesis. Defaults to 1.0.
396443
- `return_extras` (bool): If `True`, the response will include word durations detail. Defaults to `False`.
@@ -404,6 +451,7 @@ async def synthesize_streaming(self, voice: str, return_extras: bool = False, **
404451
raise ValueError('[Speech.synthesize_streaming] `voice` must not be None.')
405452

406453
self._lazy_init()
454+
assert self._session is not None, 'Session was not initialized'
407455

408456
init_msg = {
409457
'X-API-Key': self._api_key,
@@ -431,6 +479,7 @@ async def account_info(self):
431479
Returns details about your account.
432480
"""
433481
self._lazy_init()
482+
assert self._session is not None, 'Session was not initialized'
434483
url = f'{self._base_url}{_ACCOUNT_ENDPOINT}'
435484

436485
async with self._session.get(url, headers=self._build_headers()) as resp:
@@ -441,7 +490,7 @@ def _lazy_init(self):
441490
if self._session is None:
442491
self._session = aiohttp.ClientSession(connector=self._connector)
443492

444-
def _build_headers(self, type: str = None):
493+
def _build_headers(self, type: Optional[str] = None):
445494
headers = {'X-API-Key': self._api_key}
446495
if type is not None:
447496
headers['Content-Type'] = type

0 commit comments

Comments
 (0)