souzatharsis · Marxyz · Feb 20, 2025 · Feb 20, 2025
diff --git a/podcastfy/client.py b/podcastfy/client.py
@@ -73,7 +73,7 @@ def process_content(
 
         if transcript_file:
             logger.info(f"Using transcript file: {transcript_file}")
-            with open(transcript_file, "r") as file:
+            with open(transcript_file, "r", encoding="utf-8") as file:
                 qa_content = file.read()
         else:
             # Initialize content_extractor if needed
@@ -209,7 +209,7 @@ def main(
         conversation_config = None
         # Load conversation config if provided
         if conversation_config_path:
-            with open(conversation_config_path, "r") as f:
+            with open(conversation_config_path, "r", encoding='utf-8') as f:
                 conversation_config: Dict[str, Any] | None = yaml.safe_load(f)
 
         # Use default TTS model from conversation config if not specified
@@ -360,7 +360,7 @@ def generate_podcast(
         else:
             urls_list = urls or []
             if url_file:
-                with open(url_file, "r") as file:
+                with open(url_file, "r", encoding="utf-8") as file:
                     urls_list.extend([line.strip() for line in file if line.strip()])
 
             if not urls_list and not image_paths and not text and not topic:

diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py
@@ -9,6 +9,7 @@
 import os
 from typing import Optional, Dict, Any, List
 import re
+import unicodedata
 
 
 from langchain_community.chat_models import ChatLiteLLM
@@ -833,6 +834,10 @@ def __compose_prompt(self, num_images: int, longform: bool=False):
         composed_prompt_template = ChatPromptTemplate.from_messages(combined_messages)
 
         return composed_prompt_template, image_path_keys
+
+    def __sanitize_unicode_text(self, text: str) -> str:
+        """Return ``text`` normalized to Unicode NFKC form."""
+        return unicodedata.normalize('NFKC', text)
 
     def generate_qa_content(
         self,
@@ -894,12 +899,15 @@ def generate_qa_content(
                 self.response,
                 self.content_generator_config
             )
-
+
+            # Sanitize unicode response
+            self.response = self.__sanitize_unicode_text(self.response)
+
             logger.info(f"Content generated successfully")
 
             # Save output if requested
             if output_filepath:
-                with open(output_filepath, "w") as file:
+                with open(output_filepath, "w", encoding="utf-8") as file:
                     file.write(self.response)
                 logger.info(f"Response content saved to {output_filepath}")
                 print(f"Transcript saved to {output_filepath}")

diff --git a/podcastfy/content_parser/website_extractor.py b/podcastfy/content_parser/website_extractor.py
@@ -49,7 +49,7 @@ def extract_content(self, url: str) -> str:
 			headers = {'User-Agent': self.user_agent}
 			response = requests.get(normalized_url, headers=headers, timeout=self.timeout)
 			response.raise_for_status()  # Raise an exception for bad status codes
-
+			response.encoding = "utf-8"
 			# Parse the page content with BeautifulSoup
 			soup = BeautifulSoup(response.text, 'html.parser')
 

diff --git a/podcastfy/content_parser/youtube_transcriber.py b/podcastfy/content_parser/youtube_transcriber.py
@@ -55,7 +55,7 @@ def main(seed: int = 42) -> None:
 
 		# Save transcript to file
 		output_file = 'tests/data/transcripts/youtube_transcript2.txt'
-		with open(output_file, 'w') as file:
+		with open(output_file, 'w', encoding="utf-8") as file:
 			file.write(transcript)
 
 		print(f"Transcript saved to {output_file}")

diff --git a/podcastfy/utils/config.py b/podcastfy/utils/config.py
@@ -59,7 +59,7 @@ def __init__(self, config_file: str = 'config.yaml'):
 
 		config_path = get_config_path(config_file)
 		if config_path:
-			with open(config_path, 'r') as file:
+			with open(config_path, 'r', encoding='utf-8') as file:
 				self.config: Dict[str, Any] = yaml.safe_load(file)
 		else:
 			print("Could not locate config.yaml")

diff --git a/podcastfy/utils/config_conversation.py b/podcastfy/utils/config_conversation.py
@@ -158,7 +158,7 @@ def _load_default_config(self) -> Dict[str, Any]:
 		"""Load the default configuration from conversation_config.yaml."""
 		config_path = get_conversation_config_path()
 		if config_path:
-			with open(config_path, 'r') as file:
+			with open(config_path, 'r', encoding='utf-8') as file:
 				return yaml.safe_load(file)
 		else:
 			raise FileNotFoundError("conversation_config.yaml not found")

diff --git a/tests/data/mock/website.md b/tests/data/mock/website.md
@@ -1 +1 @@
-Tharsis Souza, PhD Tharsis Souza is a computer scientist passionate about data-driven products. He is Senior Vice President of Product Management, Modeling Engineering at Two Sigma Investments and Lecturer at Columbia University, Faculty member of the MSc. in Applied Analytics program. Prior to Two Sigma, he spent 10+ years delivering new technology products in a variety of companies from start-ups to Fortune 500’s in the U.S., Brazil, and the U.K. He’s an author of scholarly publications and a regular speaker in academic and business conferences. He also enjoys mentoring under-represented students & working professionals. Tharsis holds a Ph.D. in Computer Science from UCL, University of London following an M.Phil. and M.Sc. in Computer Science and a B.Sc. in Computer Engineering. Selected Interviews and Talks Mentorship Spotlight: Tharsis Souza, Two Sigma FactSet Investment Process Symposium - Innovative Data Panel BattleFin Alternative Data - Interview Beryl Elites - The Disruptors in Investment Management
+Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information...
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -36,7 +36,7 @@
   - Quiz
   - Conclusion
 podcast_name: Teachfy
-podcast_tagline: Learning Through Conversation
+podcast_tagline: Learning Through Conversation - Witaj świecie! こんにちは世界 
 output_language: English
 engagement_techniques: 
   - examples
@@ -57,7 +57,7 @@ def mock_files(tmp_path):
     transcript_file.write_text(MOCK_TRANSCRIPT)
 
     config_file = tmp_path / "custom_config.yaml"
-    config_file.write_text(MOCK_CONVERSATION_CONFIG)
+    config_file.write_text(MOCK_CONVERSATION_CONFIG, encoding="utf-8")
 
     return {
         "url_file": str(url_file),
@@ -139,7 +139,7 @@ def test_generate_transcript_only(sample_config):
         transcript_path
     ), f"Transcript file does not exist at path: {transcript_path}"
 
-    with open(transcript_path, "r") as f:
+    with open(transcript_path, "r", encoding="utf-8") as f:
         content = f.read()
         assert content != ""
         assert isinstance(content, str)
@@ -209,7 +209,7 @@ def test_generate_podcast_with_custom_config(mock_files, sample_config):
     # Check for elements from the custom config in the transcript
     transcript_path = audio_path.replace(".mp3", ".txt")
     assert os.path.exists(transcript_path)
-    with open(transcript_path, "r") as f:
+    with open(transcript_path, "r", encoding="utf-8") as f:
         content = f.read()
         assert "Teachfy" in content
         assert "Learning Through Conversation" in content
@@ -239,7 +239,7 @@ def test_generate_transcript_with_local_llm(sample_config):
     assert "Transcript generated successfully" in result.stdout
     transcript_path = result.stdout.split(": ")[-1].strip()
     assert os.path.exists(transcript_path)
-    with open(transcript_path, "r") as f:
+    with open(transcript_path, "r", encoding="utf-8") as f:
         content = f.read()
         assert content != ""
         assert isinstance(content, str)
@@ -325,7 +325,7 @@ def test_generate_transcript_only_with_custom_llm():
     assert transcript_path.endswith(".txt")
 
     # Verify transcript content
-    with open(transcript_path, "r") as f:
+    with open(transcript_path, "r", encoding="utf-8") as f:
         content = f.read()
         assert content != ""
         assert isinstance(content, str)

diff --git a/tests/test_content_parser.py b/tests/test_content_parser.py
@@ -29,14 +29,13 @@ def test_youtube_transcriber(self):
         extracted_transcript = transcriber.extract_transcript(test_url)
 
         # Load expected transcript from youtube.txt file
-        with open("./tests/data/mock/youtube.txt", "r") as f:
+        with open("./tests/data/mock/youtube.txt", "r", encoding="utf-8") as f:
             expected_transcript = f.read()
 
         # Assert that the first 100 characters of the extracted transcript match the expected transcript
         self.assertEqual(
             extracted_transcript[:100].strip(), expected_transcript[:100].strip()
         )
-
     def test_website_extractor(self):
         """
         Test the WebsiteExtractor class to ensure it correctly extracts content from a website.
@@ -48,13 +47,13 @@ def test_website_extractor(self):
         extractor = WebsiteExtractor()
 
         # Test URL
-        test_url = "http://www.souzatharsis.com"
+        test_url = "http://www.example.com"
 
         # Extract content
         extracted_content = extractor.extract_content(test_url)
         print(extracted_content.strip())
         # Load expected content from website.md file
-        with open("./tests/data/mock/website.md", "r") as f:
+        with open("./tests/data/mock/website.md", "r", encoding="utf-8") as f:
             expected_content = f.read()
         print(expected_content.strip())
         # Assert that the extracted content matches the expected content
@@ -74,7 +73,7 @@ def test_pdf_extractor(self):
         extracted_content = extractor.extract_content(pdf_path)
 
         # Load expected content from file.txt
-        with open("./tests/data/mock/file.txt", "r") as f:
+        with open("./tests/data/mock/file.txt", "r", encoding="utf-8") as f:
             expected_content = f.read()
 
         # Assert that the first 500 characters of the extracted content match the expected content

diff --git a/tests/test_genai_podcast.py b/tests/test_genai_podcast.py
@@ -89,7 +89,7 @@ def test_generate_qa_content_from_images(self):
         self.assertIsInstance(result, str)
 
         # Check if the output file was created and contains the same content
-        with open(temp_file.name, "r") as f:
+        with open(temp_file.name, "r", encoding="utf-8") as f:
             file_content = f.read()
 
         self.assertEqual(result, file_content)

diff --git a/tests/test_generate_podcast.py b/tests/test_generate_podcast.py
@@ -170,7 +170,7 @@ def test_generate_podcast_from_transcript_file(sample_conversation_config):
         .get("transcripts"),
         "test_transcript.txt",
     )
-    with open(transcript_file, "w") as f:
+    with open(transcript_file, "w", encoding="utf-8") as f:
         f.write(
             "<Person1>Joe Biden and the US Politics</Person1><Person2>Joe Biden is the current president of the United States of America</Person2>"
         )
@@ -339,7 +339,7 @@ def test_generate_transcript_with_user_instructions(
     ).get("output_directories", {}).get("transcripts")
 
     # Read the generated transcript
-    with open(result, "r") as f:
+    with open(result, "r", encoding="utf-8") as f:
         content = f.read()
 
     assert (
@@ -394,7 +394,7 @@ def test_generate_transcript_only_with_custom_llm(
     ).get("output_directories", {}).get("transcripts")
 
     # Read and verify the content
-    with open(result, "r") as f:
+    with open(result, "r", encoding="utf-8") as f:
         content = f.read()
 
     # Verify the content follows the Person1/Person2 format
@@ -427,7 +427,7 @@ def test_generate_longform_transcript(sample_config, default_conversation_config
     assert result.endswith(".txt")
 
     # Read and verify the content
-    with open(result, "r") as f:
+    with open(result, "r", encoding="utf-8") as f:
         content = f.read()
 
     # Verify the content follows the Person1/Person2 format
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		Tharsis Souza, PhD Tharsis Souza is a computer scientist passionate about data-driven products. He is Senior Vice President of Product Management, Modeling Engineering at Two Sigma Investments and Lecturer at Columbia University, Faculty member of the MSc. in Applied Analytics program. Prior to Two Sigma, he spent 10+ years delivering new technology products in a variety of companies from start-ups to Fortune 500’s in the U.S., Brazil, and the U.K. He’s an author of scholarly publications and a regular speaker in academic and business conferences. He also enjoys mentoring under-represented students & working professionals. Tharsis holds a Ph.D. in Computer Science from UCL, University of London following an M.Phil. and M.Sc. in Computer Science and a B.Sc. in Computer Engineering. Selected Interviews and Talks Mentorship Spotlight: Tharsis Souza, Two Sigma FactSet Investment Process Symposium - Innovative Data Panel BattleFin Alternative Data - Interview Beryl Elites - The Disruptors in Investment Management
		Example Domain Example Domain This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. More information...