Add voice conversion method

kaikato · kaikato · commit ed1b316daf28 · 2025-02-26T15:03:10.000-08:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -4,6 +4,7 @@ repos:
     rev: v0.4.4
     hooks:
       - id: ruff
+        args: ['--preview']
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.4.0
diff --git a/demo/convert.py b/demo/convert.py
@@ -0,0 +1,22 @@
+import argparse
+import asyncio
+from lmnt.api import Speech
+
+
+async def main(args):  # Convert the audio file named by args.audio into the voice args.voice and save the result.
+  async with Speech() as s:
+    with open(args.audio, 'rb') as f:
+      audio = f.read()  # the whole input file is read into memory as bytes
+
+    converted_audio = await s.convert(audio=audio, voice=args.voice)  # no `format` is passed, so convert()'s documented default applies
+    with open('output.mp3', 'wb') as f:  # output name is fixed; a relative path, so it is written to the current working directory
+      f.write(converted_audio)
+    print('Done.')
+
+
+if __name__ == '__main__':  # script entry point: parse the CLI flags, then drive main() with asyncio.run
+  parser = argparse.ArgumentParser(description='Convert speech to a different voice')
+  parser.add_argument('-a', '--audio', required=True, help='Filename of audio to convert')  # path of the source audio file
+  parser.add_argument('-v', '--voice', required=True, help='Voice to use')  # forwarded unchanged as the `voice` argument of convert()
+  args = parser.parse_args()
+  asyncio.run(main(args))
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,6 +7,9 @@ build-backend = "setuptools.build_meta"
 [tool.ruff]
 cache-dir = "~/.cache/ruff"
 line-length = 160
+preview = true
+
+[tool.ruff.lint]
 select = [
     "E4", "E7", "E9",
     "F",
@@ -16,12 +19,12 @@ select = [
 ]
 ignore = ["I001"]
 
-[tool.ruff.flake8-quotes]
+[tool.ruff.lint.flake8-quotes]
 docstring-quotes = "double"
 inline-quotes = "single"
 multiline-quotes = "double"
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 # F401 = unused import; this warning doesn't make sense in __init__.py files
 "__init__.py" = ["F401"]
 
diff --git a/src/lmnt/api.py b/src/lmnt/api.py
@@ -28,6 +28,7 @@
 _VOICE_ENDPOINT = '/v1/ai/voice/{id}'
 _CREATE_VOICE_ENDPOINT = '/v1/ai/voice'
 _SPEECH_ENDPOINT = '/v1/ai/speech'
+_CONVERT_ENDPOINT = '/v1/ai/speech/convert'
 _ACCOUNT_ENDPOINT = '/v1/account'
 
 
@@ -158,6 +159,7 @@ async def list_voices(self, starred: bool = False, owner: str = 'all'):
     if not isinstance(starred, bool):
       raise ValueError(f'Invalid starred: {starred}')
     self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
     url = f'{self._base_url}{_LIST_VOICES_ENDPOINT}?starred={starred}&owner={owner}'
 
     async with self._session.get(url, headers=self._build_headers()) as resp:
@@ -174,13 +176,14 @@ async def voice_info(self, voice_id: str):
     Returns a dictionary containing details of the voice.
     """
     self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
     url = f'{self._base_url}{_VOICE_ENDPOINT}'.format(id=voice_id)
 
     async with self._session.get(url, headers=self._build_headers()) as resp:
       await self._handle_response_errors(resp, 'Speech.voice_info')
       return await resp.json()
 
-  async def create_voice(self, name: str, enhance: bool, filenames: List[str], type: str = 'instant', gender: str = None, description: str = None):
+  async def create_voice(self, name: str, enhance: bool, filenames: List[str], type: str = 'instant', gender: Optional[str] = None, description: Optional[str] = None):
     """
     Creates a new voice from a set of audio files. Returns the voice metadata object.
 
@@ -217,6 +220,7 @@ async def create_voice(self, name: str, enhance: bool, filenames: List[str], typ
       raise ValueError('[Speech.create_voice] Enhance must not be None.')
 
     self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
 
     metadata = json.dumps({
         'name': name,
@@ -256,6 +260,7 @@ async def update_voice(self, voice_id: str, **kwargs):
     - `description` (str): A description of the voice.
     """
     self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
     url = f'{self._base_url}{_VOICE_ENDPOINT}'.format(id=voice_id)
 
     data = {
@@ -298,6 +303,7 @@ async def delete_voice(self, voice_id: str):
     - `voice_id` (str): The id of the voice to delete. If you don't know the id, you can get it from `list_voices()`.
     """
     self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
     url = f'{self._base_url}{_VOICE_ENDPOINT}'.format(id=voice_id)
 
     async with self._session.delete(url, headers=self._build_headers()) as resp:
@@ -343,6 +349,7 @@ async def synthesize(self, text: str, voice: str, **kwargs):
     assert len(voice) > 0, '[Speech.synthesize] `voice` must be non-empty.'
 
     self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
     url = f'{self._base_url}{_SPEECH_ENDPOINT}'
 
     model = kwargs.get('model', 'aurora')
@@ -384,13 +391,53 @@ async def synthesize(self, text: str, voice: str, **kwargs):
         synthesis_result['seed'] = response_data['seed']
       return synthesis_result
 
+  async def convert(self, audio: bytes, voice: str, **kwargs) -> bytes:
+    """
+    Converts speech from one voice to another.
+
+    Required parameters:
+    - `audio` (bytes): The audio file to be converted into a new voice. Max file size: 1 MB.
+    - `voice` (str): The voice id to convert the speech into. Voice ids can be retrieved from `list_voices()` or `voice_info()`.
+
+    Optional parameters:
+    - `format` (str): The audio format to use for conversion. Defaults to `mp3`.
+    - `sample_rate` (int): 8000, 16000, or 24000 - the desired output sample rate. Defaults to 24000.
+    - `language` (str): The language of the source audio. Two letter ISO 639-1 code. Defaults to `en`.
+
+    Returns:
+    - bytes: The binary audio data of the converted speech.
+    """
+    assert audio is not None, '[Speech.convert] `audio` must not be None.'
+    assert len(audio) > 0, '[Speech.convert] `audio` must be non-empty.'
+    assert voice is not None, '[Speech.convert] `voice` must not be None.'
+    assert len(voice) > 0, '[Speech.convert] `voice` must be non-empty.'
+
+    self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
+    url = f'{self._base_url}{_CONVERT_ENDPOINT}'
+
+    form_data = aiohttp.FormData()
+    form_data.add_field('audio', audio, filename='audio')  # explicit filename keeps this a file part; aiohttp deprecates inferring it for bytes
+    form_data.add_field('voice', voice)
+    # Optional fields are sent only when supplied. Form values must be str/bytes, so the integer `sample_rate` is stringified.
+    if 'format' in kwargs:
+      form_data.add_field('format', kwargs['format'])
+    if 'sample_rate' in kwargs:
+      form_data.add_field('sample_rate', str(kwargs['sample_rate']))
+    if 'language' in kwargs:
+      form_data.add_field('language', kwargs['language'])
+
+    async with self._session.post(url, data=form_data, headers=self._build_headers()) as resp:
+      await self._handle_response_errors(resp, 'Speech.convert')
+      return await resp.read()
+
   async def synthesize_streaming(self, voice: str, return_extras: bool = False, **kwargs):
     """
     Initiates a full-duplex streaming connection with the server that allows you to send text and receive audio in real-time.
 
     Parameters:
-    - `format` (str): `mp3`, `raw`, or `ulaw` – the desired output format. Defaults to `mp3`.
-    - `sample_rate` (int): 8000, 16000, or 24000 – the desired output sample rate. Defaults to 24000.
+    - `format` (str): `mp3`, `raw`, or `ulaw` - the desired output format. Defaults to `mp3`.
+    - `sample_rate` (int): 8000, 16000, or 24000 - the desired output sample rate. Defaults to 24000.
     - `voice` (str): The voice id to use for this connection.
     - `speed` (float): The speed to use for synthesis. Defaults to 1.0.
     - `return_extras` (bool): If `True`, the response will include word durations detail. Defaults to `False`.
@@ -404,6 +451,7 @@ async def synthesize_streaming(self, voice: str, return_extras: bool = False, **
       raise ValueError('[Speech.synthesize_streaming] `voice` must not be None.')
 
     self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
 
     init_msg = {
         'X-API-Key': self._api_key,
@@ -431,6 +479,7 @@ async def account_info(self):
     Returns details about your account.
     """
     self._lazy_init()
+    assert self._session is not None, 'Session was not initialized'
     url = f'{self._base_url}{_ACCOUNT_ENDPOINT}'
 
     async with self._session.get(url, headers=self._build_headers()) as resp:
@@ -441,7 +490,7 @@ def _lazy_init(self):
     if self._session is None:
       self._session = aiohttp.ClientSession(connector=self._connector)
 
-  def _build_headers(self, type: str = None):
+  def _build_headers(self, type: Optional[str] = None):
     headers = {'X-API-Key': self._api_key}
     if type is not None:
       headers['Content-Type'] = type