Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions podcastfy/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def process_content(

if transcript_file:
logger.info(f"Using transcript file: {transcript_file}")
with open(transcript_file, "r") as file:
with open(transcript_file, "r", encoding="utf-8") as file:
qa_content = file.read()
else:
# Initialize content_extractor if needed
Expand Down Expand Up @@ -209,7 +209,7 @@ def main(
conversation_config = None
# Load conversation config if provided
if conversation_config_path:
with open(conversation_config_path, "r") as f:
with open(conversation_config_path, "r", encoding='utf-8') as f:
conversation_config: Dict[str, Any] | None = yaml.safe_load(f)

# Use default TTS model from conversation config if not specified
Expand Down Expand Up @@ -360,7 +360,7 @@ def generate_podcast(
else:
urls_list = urls or []
if url_file:
with open(url_file, "r") as file:
with open(url_file, "r", encoding="utf-8") as file:
urls_list.extend([line.strip() for line in file if line.strip()])

if not urls_list and not image_paths and not text and not topic:
Expand Down
12 changes: 10 additions & 2 deletions podcastfy/content_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os
from typing import Optional, Dict, Any, List
import re
import unicodedata


from langchain_community.chat_models import ChatLiteLLM
Expand Down Expand Up @@ -833,6 +834,10 @@ def __compose_prompt(self, num_images: int, longform: bool=False):
composed_prompt_template = ChatPromptTemplate.from_messages(combined_messages)

return composed_prompt_template, image_path_keys

def __sanitize_unicode_text(self, text: str) -> str:
    """Return ``text`` converted to Unicode normalization form NFKC.

    Args:
        text: The string to normalize.

    Returns:
        The NFKC-normalized copy of ``text``.
    """
    # NFKC: compatibility decomposition followed by canonical composition.
    return unicodedata.normalize("NFKC", text)

def generate_qa_content(
self,
Expand Down Expand Up @@ -894,12 +899,15 @@ def generate_qa_content(
self.response,
self.content_generator_config
)


# Sanitize unicode response
self.response = self.__sanitize_unicode_text(self.response)

logger.info(f"Content generated successfully")

# Save output if requested
if output_filepath:
with open(output_filepath, "w") as file:
with open(output_filepath, "w", encoding="utf-8") as file:
file.write(self.response)
logger.info(f"Response content saved to {output_filepath}")
print(f"Transcript saved to {output_filepath}")
Expand Down
2 changes: 1 addition & 1 deletion podcastfy/content_parser/website_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def extract_content(self, url: str) -> str:
headers = {'User-Agent': self.user_agent}
response = requests.get(normalized_url, headers=headers, timeout=self.timeout)
response.raise_for_status() # Raise an exception for bad status codes

response.encoding = "utf-8"
# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

Expand Down
2 changes: 1 addition & 1 deletion podcastfy/content_parser/youtube_transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def main(seed: int = 42) -> None:

# Save transcript to file
output_file = 'tests/data/transcripts/youtube_transcript2.txt'
with open(output_file, 'w') as file:
with open(output_file, 'w', encoding="utf-8") as file:
file.write(transcript)

print(f"Transcript saved to {output_file}")
Expand Down
2 changes: 1 addition & 1 deletion podcastfy/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __init__(self, config_file: str = 'config.yaml'):

config_path = get_config_path(config_file)
if config_path:
with open(config_path, 'r') as file:
with open(config_path, 'r', encoding='utf-8') as file:
self.config: Dict[str, Any] = yaml.safe_load(file)
else:
print("Could not locate config.yaml")
Expand Down
2 changes: 1 addition & 1 deletion podcastfy/utils/config_conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def _load_default_config(self) -> Dict[str, Any]:
"""Load the default configuration from conversation_config.yaml."""
config_path = get_conversation_config_path()
if config_path:
with open(config_path, 'r') as file:
with open(config_path, 'r', encoding='utf-8') as file:
return yaml.safe_load(file)
else:
raise FileNotFoundError("conversation_config.yaml not found")
Expand Down
2 changes: 1 addition & 1 deletion tests/data/mock/website.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Tharsis Souza, PhD Tharsis Souza is a computer scientist passionate about data-driven products. He is Senior Vice President of Product Management, Modeling Engineering at Two Sigma Investments and Lecturer at Columbia University, Faculty member of the MSc. in Applied Analytics program. Prior to Two Sigma, he spent 10+ years delivering new technology products in a variety of companies from start-ups to Fortune 500’s in the U.S., Brazil, and the U.K. He’s an author of scholarly publications and a regular speaker in academic and business conferences. He also enjoys mentoring under-represented students & working professionals. Tharsis holds a Ph.D. in Computer Science from UCL, University of London following an M.Phil. and M.Sc. in Computer Science and a B.Sc. in Computer Engineering. Selected Interviews and Talks Mentorship Spotlight: Tharsis Souza, Two Sigma FactSet Investment Process Symposium - Innovative Data Panel BattleFin Alternative Data - Interview Beryl Elites - The Disruptors in Investment Management
Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information...
12 changes: 6 additions & 6 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
- Quiz
- Conclusion
podcast_name: Teachfy
podcast_tagline: Learning Through Conversation
podcast_tagline: Learning Through Conversation - Witaj świecie! こんにちは世界
output_language: English
engagement_techniques:
- examples
Expand All @@ -57,7 +57,7 @@ def mock_files(tmp_path):
transcript_file.write_text(MOCK_TRANSCRIPT)

config_file = tmp_path / "custom_config.yaml"
config_file.write_text(MOCK_CONVERSATION_CONFIG)
config_file.write_text(MOCK_CONVERSATION_CONFIG, encoding="utf-8")

return {
"url_file": str(url_file),
Expand Down Expand Up @@ -139,7 +139,7 @@ def test_generate_transcript_only(sample_config):
transcript_path
), f"Transcript file does not exist at path: {transcript_path}"

with open(transcript_path, "r") as f:
with open(transcript_path, "r", encoding="utf-8") as f:
content = f.read()
assert content != ""
assert isinstance(content, str)
Expand Down Expand Up @@ -209,7 +209,7 @@ def test_generate_podcast_with_custom_config(mock_files, sample_config):
# Check for elements from the custom config in the transcript
transcript_path = audio_path.replace(".mp3", ".txt")
assert os.path.exists(transcript_path)
with open(transcript_path, "r") as f:
with open(transcript_path, "r", encoding="utf-8") as f:
content = f.read()
assert "Teachfy" in content
assert "Learning Through Conversation" in content
Expand Down Expand Up @@ -239,7 +239,7 @@ def test_generate_transcript_with_local_llm(sample_config):
assert "Transcript generated successfully" in result.stdout
transcript_path = result.stdout.split(": ")[-1].strip()
assert os.path.exists(transcript_path)
with open(transcript_path, "r") as f:
with open(transcript_path, "r", encoding="utf-8") as f:
content = f.read()
assert content != ""
assert isinstance(content, str)
Expand Down Expand Up @@ -325,7 +325,7 @@ def test_generate_transcript_only_with_custom_llm():
assert transcript_path.endswith(".txt")

# Verify transcript content
with open(transcript_path, "r") as f:
with open(transcript_path, "r", encoding="utf-8") as f:
content = f.read()
assert content != ""
assert isinstance(content, str)
Expand Down
9 changes: 4 additions & 5 deletions tests/test_content_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,13 @@ def test_youtube_transcriber(self):
extracted_transcript = transcriber.extract_transcript(test_url)

# Load expected transcript from youtube.txt file
with open("./tests/data/mock/youtube.txt", "r") as f:
with open("./tests/data/mock/youtube.txt", "r", encoding="utf-8") as f:
expected_transcript = f.read()

# Assert that the first 100 characters of the extracted transcript match the expected transcript
self.assertEqual(
extracted_transcript[:100].strip(), expected_transcript[:100].strip()
)

def test_website_extractor(self):
"""
Test the WebsiteExtractor class to ensure it correctly extracts content from a website.
Expand All @@ -48,13 +47,13 @@ def test_website_extractor(self):
extractor = WebsiteExtractor()

# Test URL
test_url = "http://www.souzatharsis.com"
test_url = "http://www.example.com"

# Extract content
extracted_content = extractor.extract_content(test_url)
print(extracted_content.strip())
# Load expected content from website.md file
with open("./tests/data/mock/website.md", "r") as f:
with open("./tests/data/mock/website.md", "r", encoding="utf-8") as f:
expected_content = f.read()
print(expected_content.strip())
# Assert that the extracted content matches the expected content
Expand All @@ -74,7 +73,7 @@ def test_pdf_extractor(self):
extracted_content = extractor.extract_content(pdf_path)

# Load expected content from file.txt
with open("./tests/data/mock/file.txt", "r") as f:
with open("./tests/data/mock/file.txt", "r", encoding="utf-8") as f:
expected_content = f.read()

# Assert that the first 500 characters of the extracted content match the expected content
Expand Down
2 changes: 1 addition & 1 deletion tests/test_genai_podcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_generate_qa_content_from_images(self):
self.assertIsInstance(result, str)

# Check if the output file was created and contains the same content
with open(temp_file.name, "r") as f:
with open(temp_file.name, "r", encoding="utf-8") as f:
file_content = f.read()

self.assertEqual(result, file_content)
Expand Down
8 changes: 4 additions & 4 deletions tests/test_generate_podcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def test_generate_podcast_from_transcript_file(sample_conversation_config):
.get("transcripts"),
"test_transcript.txt",
)
with open(transcript_file, "w") as f:
with open(transcript_file, "w", encoding="utf-8") as f:
f.write(
"<Person1>Joe Biden and the US Politics</Person1><Person2>Joe Biden is the current president of the United States of America</Person2>"
)
Expand Down Expand Up @@ -339,7 +339,7 @@ def test_generate_transcript_with_user_instructions(
).get("output_directories", {}).get("transcripts")

# Read the generated transcript
with open(result, "r") as f:
with open(result, "r", encoding="utf-8") as f:
content = f.read()

assert (
Expand Down Expand Up @@ -394,7 +394,7 @@ def test_generate_transcript_only_with_custom_llm(
).get("output_directories", {}).get("transcripts")

# Read and verify the content
with open(result, "r") as f:
with open(result, "r", encoding="utf-8") as f:
content = f.read()

# Verify the content follows the Person1/Person2 format
Expand Down Expand Up @@ -427,7 +427,7 @@ def test_generate_longform_transcript(sample_config, default_conversation_config
assert result.endswith(".txt")

# Read and verify the content
with open(result, "r") as f:
with open(result, "r", encoding="utf-8") as f:
content = f.read()

# Verify the content follows the Person1/Person2 format
Expand Down
Loading