-
Notifications
You must be signed in to change notification settings - Fork 7
Feature/voice analysis #30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 16 commits
5ec016a
b0d00fd
db5f243
eac60f9
1429e72
f9ffc8e
bfb61a1
fe3f487
29a6e08
ff0f003
63d235d
35043f4
3b95203
e6f0b5f
9da1078
7dee943
4ca2cd4
907cdfe
493e531
70d32bb
1cfab8f
c620f9a
af4dfd0
73e5269
2bda1d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| import pandas as pd | ||
| import speech_recognition as sr | ||
| from inaSpeechSegmenter import Segmenter | ||
|
|
||
|
|
||
class GenderAudioProcessor:
    """Segment a film's audio by speaker gender and transcribe each segment.

    On construction the instance runs the gender segmenter on ``path_to_file``,
    transcribes every gendered segment read from ``path_to_audio`` and totals
    the speaking time per gender.

    Attributes:
        title: Base name of ``path_to_file`` up to its first dot.
        media: Path handed to the segmenter.
        audio: Path of the audio file used for speech-to-text.
        gendered_audio_seg: DataFrame with columns ``gender``, ``start``, ``end``.
        dialogues: DataFrame with columns ``gender`` and ``transcription``.
        speaking_time: Dict with the summed segment durations per gender.
    """

    def __init__(self, path_to_file, path_to_audio, language='fr-FR'):
        """Run the whole pipeline for one film.

        Parameters:
            path_to_file (str): Path to the media file given to the segmenter.
            path_to_audio (str): Path to the audio file used for transcription.
            language (str): Language code forwarded to the speech recogniser.
                Defaults to ``'fr-FR'``, the value that used to be hard-coded.
        """
        self.title = self._title_from_path(path_to_file)
        self.media = path_to_file
        self.audio = path_to_audio
        self.gendered_audio_seg = self.segment()  # DataFrame
        self.dialogues = self.run_speech_to_text(language=language)
        self.speaking_time = self.compute_speaking_time_allocation()

    @staticmethod
    def _title_from_path(path_to_file):
        """Return the file name of ``path_to_file`` truncated at its first dot."""
        # ntpath accepts both "\\" and "/" as separators, so Windows-style and
        # POSIX-style paths give the same title.
        import ntpath

        return ntpath.basename(path_to_file).split('.')[0]

    def __str__(self):
        return f"Film : {self.title}"

    def __repr__(self):
        return self.title

    def segment(self):
        """Extract time intervals from self.media, according to the speaker's gender.

        Returns:
            pd.DataFrame: DataFrame with 3 columns (gender, start, end), one row
            per segment labelled 'male' or 'female'; other labels are dropped.
        """
        # The higher the energy ratio, the more selective it is;
        # vad_engine works better with 'sm' than 'smn'.
        seg = Segmenter(vad_engine='sm', energy_ratio=0.05)
        gendered = [row for row in seg(self.media) if row[0] in ('male', 'female')]
        return pd.DataFrame(gendered, columns=['gender', 'start', 'end'])

    def search_gender_tag(self, time: int):
        """Retrieve the gender associated with a given time (in seconds).

        Requires the dataframe generated by the segmenter. A segment covers the
        half-open interval ``[start, end)``, so a time equal to a segment's
        start belongs to that segment.

        Parameters:
            time (int): The time of interest, given in seconds.

        Returns:
            str or None: The gender of the speaker at ``time``; None when no
            segment covers it (including when there are no segments at all).
        """
        segments = self.gendered_audio_seg
        if segments.empty:
            return None
        covering = (segments['start'] <= time) & (time < segments['end'])
        matches = segments.loc[covering, 'gender']
        if matches.empty:
            return None
        # Segments are not expected to overlap; if they do, keep the last one.
        return matches.iloc[-1]

    def compute_speaking_time_allocation(self):
        """Sum the duration of the segments for each gender.

        Returns:
            dict: ``{'male': total, 'female': total}`` in the unit of the
            ``start``/``end`` columns.
        """
        speaking_time = {'male': 0, 'female': 0}
        segments = self.gendered_audio_seg
        for gender, start, end in zip(segments['gender'], segments['start'], segments['end']):
            if gender in speaking_time:
                speaking_time[gender] += float(end - start)
        return speaking_time

    def decode_speech(self, start_time=None, end_time=None, language="en-US"):
        """Transcribe a time range of self.audio with Google Speech Recognition.

        Parameters:
            start_time (float or None): Offset of the range start; None reads
                from the beginning of the file.
            end_time (float or None): End of the range; None reads to the end.
            language (str): Recogniser language code, e.g. "en-US" or "fr-FR".

        Returns:
            str or None: The recognised text, or None when the audio could not
            be understood or the service could not be reached.
        """
        recognizer = sr.Recognizer()
        duration = None
        if end_time is not None:
            # A missing start means "from the beginning of the file".
            duration = end_time - (start_time or 0)

        with sr.AudioFile(self.audio) as source:
            audio_data = recognizer.record(source, duration=duration, offset=start_time)

        # Only the recogniser's own failures are treated as recoverable;
        # anything else is a programming error and should surface.
        try:
            text = recognizer.recognize_google(audio_data, language=language)
        except (sr.UnknownValueError, sr.RequestError):
            print('Sorry.. run again...')
            return None
        print('Converting audio transcripts into text ...')
        return text

    def run_speech_to_text(self, language='fr-FR'):
        """Transcribe every gendered segment.

        Parameters:
            language (str): Language code passed to ``decode_speech``.

        Returns:
            pd.DataFrame: Columns ``gender`` and ``transcription``, one row per
            segment.
        """
        segments = self.gendered_audio_seg
        transcript = [
            self.decode_speech(start_time=start, end_time=end, language=language)
            for start, end in zip(segments['start'], segments['end'])
        ]
        # Reuse the segment index so that concat aligns rows one-to-one.
        transcription = pd.Series(transcript, index=segments.index, name="transcription")
        return pd.concat([segments['gender'], transcription], axis=1)

    def export_to_csv(self, file_path: str):
        """Write segments and their transcription to a ';'-separated CSV file."""
        result = pd.concat([self.gendered_audio_seg, self.dialogues['transcription']], axis=1)
        result.to_csv(path_or_buf=file_path, sep=";", header=True, index=False)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| from transformers import pipeline | ||
|
DnzzL marked this conversation as resolved.
Outdated
|
||
|
|
||
|
|
||
class SpeechRecognition:
    """Speech recognition for audio files, backed by a Whisper pipeline."""

    def __init__(self, model_name="openai/whisper-small"):
        """Build the automatic-speech-recognition pipeline.

        Args:
            model_name (str): Whisper model name. Defaults to "openai/whisper-small".
        """
        pipeline_options = {
            "task": "automatic-speech-recognition",
            "model": model_name,
            "chunk_length_s": 30,
            "stride_length_s": (5, 5),
            "return_timestamps": True,
            "generate_kwargs": {"max_length": 1000},
        }
        self.pipe = pipeline(**pipeline_options)

    def transcribe(self, audio_path, language, task="transcribe"):
        """Transcribe an audio file.

        The decoder prompt for ``language``/``task`` is stored on the model
        config before the pipeline runs, so it stays in effect afterwards.

        Args:
            audio_path (str): Path to audio file
            language (str): target language
            task (str): transcribe for same language or translate to another language

        Returns:
            Whatever the pipeline returns for ``audio_path`` (the original
            docstring describes it as a dict holding the transcribed text).
        """
        tokenizer = self.pipe.tokenizer
        prompt_ids = tokenizer.get_decoder_prompt_ids(language=language, task=task)
        self.pipe.model.config.forced_decoder_ids = prompt_ids
        return self.pipe(audio_path)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| import main | ||
|
DnzzL marked this conversation as resolved.
Outdated
|
||
| import pandas as pd | ||
|
|
||
|
|
||
class AudioProcessor:
    """Run the complete pipeline from an audio file to text-format dialogues.

    The segmenter, dialogue tagger and transcriber are the instances exposed
    as module attributes of ``main``.

    Attributes:
        audio: Path of the audio file being processed.
        language: Language code forwarded to the transcriber.
        gendered_audio_seg: DataFrame returned by the gender segmentor.
        feminine_dialogues: DataFrame returned by the dialogue tagger.
        result: DataFrame with columns ``gender`` and ``transcription``.
    """

    def __init__(self, audio_file="", language="en-US"):
        self.audio = audio_file
        self.language = language
        self.gendered_audio_seg = self.gender_segmentor()
        self.feminine_dialogues = self.dialogue_tagger()
        self.result = self.run_speech_to_text()

    def gender_segmentor(self):
        """Segment self.audio by speaker gender."""
        return main.gender_segmentor.segment(self.audio)

    def dialogue_tagger(self):
        """Select the dialogue segments to transcribe."""
        return main.dialogue_tagger.extract_dialogues_subsets(self.gendered_audio_seg)

    def run_speech_to_text(self):
        """Transcribe every selected dialogue segment.

        Returns:
            pd.DataFrame: The ``gender`` column of all segments next to a
            ``transcription`` column; segments the tagger did not select get
            no transcription.
        """
        dialogues = self.feminine_dialogues
        # Start and duration both come from the tagged dialogues, so each
        # call describes exactly one selected segment.
        transcript = [
            main.stt_transcriber.speech_to_text(self.audio,
                                                start_time=start,
                                                duration=end - start,
                                                language=self.language)
            for start, end in zip(dialogues['start'], dialogues['end'])
        ]
        # Keep the dialogues' index so rows still align with the segments
        # when the tagger returns only a subset of them.
        transcription = pd.Series(transcript, index=dialogues.index, name="transcription")
        return pd.concat([self.gendered_audio_seg['gender'], transcription], axis=1)

    def export_to_csv(self, file_path: str):
        """Write segments and their transcription to a ';'-separated CSV file."""
        result = pd.concat([self.gendered_audio_seg, self.result['transcription']], axis=1)
        result.to_csv(path_or_buf=file_path, sep=";", header=True, index=False)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| import transcriber | ||
|
DnzzL marked this conversation as resolved.
Outdated
|
||
| import gender_segmenter | ||
| import dialogue_tagger | ||
|
|
||
|
|
||
def get_gender_segmentor():
    """Return a new InaSpeechSegmentor, the gender segmentor implementation."""
    segmentor = gender_segmenter.InaSpeechSegmentor()
    return segmentor
|
|
||
|
|
||
def get_dialogue_tagger():
    """Return a new RuleBasedTagger, the dialogue tagger implementation."""
    tagger = dialogue_tagger.RuleBasedTagger()
    return tagger
|
|
||
|
|
||
def get_gender_transcriber():
    """Return a new GoogleSpeechRecognition, the speech-to-text implementation."""
    speech_transcriber = transcriber.GoogleSpeechRecognition()
    return speech_transcriber
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| from abc import ABC, abstractmethod | ||
|
DnzzL marked this conversation as resolved.
|
||
|
|
||
|
|
||
class DialogueTagger(ABC):
    """Abstract class common to every dialogue tagger.

    A tagger receives the dataframe of segments and returns the dialogue
    subsets to keep.
    """

    @abstractmethod
    def extract_dialogues_subsets(self, segments_dataframe):
        """Return the dialogue subsets extracted from ``segments_dataframe``.

        Parameters:
            segments_dataframe: Dataframe of segments to tag
                (presumably the gender/start/end frame built by the segmentor).
        """
        pass
|
|
||
|
|
||
class RuleBasedTagger(DialogueTagger):
    """Dialogue tagger that currently returns every segment unchanged."""

    def __init__(self):
        """Create the tagger; it holds no state."""

    def extract_dialogues_subsets(self, segments_dataframe):
        """Return ``segments_dataframe`` itself (no filtering is applied)."""
        selected = segments_dataframe
        return selected
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| import pandas as pd | ||
| from abc import ABC, abstractmethod | ||
| from inaSpeechSegmenter import Segmenter | ||
|
|
||
|
|
||
class GenderSegmentor(ABC):
    """Base interface shared by all gender segmentors.

    Implementations turn an audio file into a dataframe of time slots, each
    associated with the speaker's gender.
    """

    @abstractmethod
    def segment(self, audio_file):
        """Segment ``audio_file``; concrete subclasses provide the logic."""
|
|
||
|
|
||
class InaSpeechSegmentor(GenderSegmentor):
    """Gender segmentor backed by inaSpeechSegmenter's ``Segmenter``."""

    def __init__(self):
        # The higher the energy ratio, the more selective it is;
        # vad_engine works better with 'sm' than 'smn'.
        self.seg = Segmenter(vad_engine='sm', energy_ratio=0.05)

    def segment(self, audio_file):
        """Extract time intervals from audio_file, according to the speaker's gender.

        Returns:
            pd.DataFrame: DataFrame with 3 columns (gender, start, end), keeping
            only the rows labelled 'male' or 'female'.
        """
        all_rows = self.seg(audio_file)
        gendered_rows = [row for row in all_rows if row[0] in ('male', 'female')]
        return pd.DataFrame(gendered_rows, columns=['gender', 'start', 'end'])
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| import dependencies as dep | ||
| import audio_processor as ap | ||
|
|
||
# Shared pipeline components, created once at import time.
# audio_processor reads them as attributes of this module
# (main.gender_segmentor, main.dialogue_tagger, main.stt_transcriber).
gender_segmentor = dep.get_gender_segmentor()
dialogue_tagger = dep.get_dialogue_tagger()
stt_transcriber = dep.get_gender_transcriber()
|
|
||
|
|
||
if __name__ == "__main__":
    # Run the full pipeline on a sample extract and export the result as CSV.
    # NOTE(review): the input is a hard-coded, Windows-style relative path;
    # it only resolves when launched from the expected directory.
    AP = ap.AudioProcessor(".\\..\\..\\..\\HP4_extract2.wav", language="fr-FR")
    AP.export_to_csv("./HP4_results.csv")
    # AP.compute_speaking_time_allocation()
    # AP.compute_bechdel_scene_duration()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| from abc import ABC, abstractmethod | ||
| import speech_recognition as sr | ||
|
TheoLvs marked this conversation as resolved.
|
||
|
|
||
|
|
||
class Transcriber(ABC):
    """Abstract class common to every transcriber model.

    Offers a helper to read an audio range of a given duration.
    """

    def __init__(self):
        self.r = sr.Recognizer()

    def read(self, audio_file, start_time=None, duration=None):
        """Record ``duration`` of ``audio_file`` starting at ``start_time``.

        Both bounds are passed straight to ``Recognizer.record`` as
        ``offset`` and ``duration``.
        """
        with sr.AudioFile(audio_file) as source:
            recorded = self.r.record(source, duration=duration, offset=start_time)
        return recorded

    @abstractmethod
    def speech_to_text(self, audio_file, start_time, duration, language):
        """Transcribe the given range of ``audio_file``; implemented by subclasses."""
|
|
||
|
|
||
class GoogleSpeechRecognition(Transcriber):
    """Recognize speech using Google Speech Recognition."""

    def __init__(self):
        super().__init__()

    def speech_to_text(self, audio_file, start_time, duration, language):
        """Transcribe a range of ``audio_file``.

        Parameters:
            audio_file: Path of the audio file to read.
            start_time: Offset of the range start, passed to ``read``.
            duration: Length of the range, passed to ``read``.
            language (str): Recogniser language code, e.g. "fr-FR".

        Returns:
            str or None: The recognised text, or None when the audio could not
            be understood or the service could not be reached.
        """
        audio = self.read(audio_file, start_time, duration)
        try:
            # language must be passed by keyword: the second positional
            # parameter of recognize_google is the API key, not the language.
            return self.r.recognize_google(audio, language=language)
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
        return None
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| import os | ||
|
|
||
| import moviepy.editor as mp | ||
| from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip | ||
|
|
||
|
|
||
def cut_and_save(movie_path: str, start: float, end: float, target_name: str) -> None:
    """Cut a video between two times and save the excerpt as target_name.

    Parameters:
        movie_path (str): The path to the video file.
        start (float): The start time in seconds.
        end (float): The end time in seconds.
        target_name (str): The file name of the new video file.

    Returns:
        Whatever ``ffmpeg_extract_subclip`` returns (annotated as None).
    """
    # NOTE(review): reviewers suggested moving the video utilities into the
    # bechdelai.video.video.Video class; the author proposed removing these
    # helpers for now since they were only occasional conveniences.
    outcome = ffmpeg_extract_subclip(movie_path, start, end, targetname=target_name)
    return outcome
|
|
||
|
|
||
def import_as_clip(path_to_video: str) -> mp.VideoFileClip:
    """Open a video file as a moviepy clip.

    Parameters:
        path_to_video (str): Path to a video file.

    Returns:
        mp.VideoFileClip: Clip object built from ``path_to_video``.
    """
    clip = mp.VideoFileClip(path_to_video)
    return clip
|
|
||
|
|
||
def separate_voice_and_music(path_to_mixed_audio: str) -> None:
    """Split an audio file into its individual parts using spleeter.

    Does not work above 700 seconds or about 11 minutes.

    Stores the results in separate folders under ``../../../`` (relative to
    the current working directory).

    Failures are reported as warnings rather than raised, because the
    previous shell-based call never raised either.

    Parameters:
        path_to_mixed_audio (str): Path to an audio file (.wav)

    Returns:
        None
    """
    # NOTE(review): reviewers suggested moving the spleeter step into its own
    # module, since it could later be swapped for other audio-to-audio models;
    # the author proposed removing these helpers for now.
    import subprocess
    import warnings

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and prevents command injection.
    command = [
        'spleeter', 'separate',
        '-d', '700.0',
        '-o', '../../../',
        '-f', '{instrument}/{filename}.{codec}',
        path_to_mixed_audio,
    ]
    try:
        completed = subprocess.run(command, check=False)
    except FileNotFoundError:
        warnings.warn("spleeter executable not found; no separation was performed")
        return
    if completed.returncode != 0:
        warnings.warn("spleeter exited with status {}".format(completed.returncode))
|
|
||
|
|
||
def extract_audio_from_movie(file: str, extension: str = '.wav') -> None:
    """Extract the audio from a film and save it to a file.

    The audio is saved in the same directory as the film, under the film's
    name with its last extension replaced by ``extension``.

    Parameters:
        file (str): The name of the film file to extract the audio from.
        extension (str): The file extension of the audio file to save (default is ".wav").
    """
    # splitext only strips the final extension, so dots in directory names
    # (e.g. "./films/movie.mp4" or "..\\movie.mp4") no longer truncate the path.
    audio_path = os.path.splitext(file)[0] + extension
    clip = import_as_clip(file)
    try:
        clip.audio.write_audiofile(audio_path)
    finally:
        # Release the readers held by the clip even if writing fails.
        clip.close()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| path_to_extract=<path_to_extract_file> | ||
| path_to_audio=<path_to_audio_file> | ||
| path_to_full_movie=<path_to_full_movie_file> | ||
| path_to_trailer=<path_to_trailer_file> |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| # Readme pour l'audio | ||
| ## Installation | ||
| * .env | ||
| * créer un fichier ".env" en local pour y placer le chemin vers la vidéo, comme dans .env.example | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mettons plutôt les chemins vers des fichiers comme des arguments de fonction, sinon ça rend difficile à utiliser dans des notebooks, dans des apps de démos, et rend le code moins versatile
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Je suis d'accord dans l'idée mais si j'ai fait ça c'est pour éviter que mon chemin soit en dur dans le fichier gender_identification.py. Les chemins vers les fichiers sont bien des arguments de fonction (ou plutôt de classe), ils sont juste remplacés par ceux renseignés dans l'environnement local après un os.getenv. |
||
| * poetry update / poetry install | ||
| * poetry run python .\gender_identification.py | ||
| * ffmpeg codec (pour Windows, suivre les instructions [ici](https://www.geeksforgeeks.org/how-to-install-ffmpeg-on-windows/) - | ||
| pas besoin de mettre à la racine en admin et de redémarrer ; | ||
| pour ubuntu `$ sudo apt-get install ffmpeg`, voir la [doc du projet](https://github.com/ina-foss/inaSpeechSegmenter)) | ||
|
|
||
| * Pour installer `spleeter` de Deezer, la version de NumPy doit être entre 1.16.0 et 1.19.5. | ||
| Ce qui implique de changer également la version de tensorflow. Pour éviter de faire imploser les dépendances, il faut créer un environnement virtuel. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Faut qu'on trouve un moyen de rendre tout compatible, sinon on va pas pouvoir rendre l'installation possible avec Poetry, et donc rendre compliqué la démo derrière
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Spleeter se lance en ligne de commande et enregistre des fichiers sur l'ordi. Je pense que c'est de toute façon pas viable dans un projet mais ça reste bien pratique. Si tu vois une alternative intégrable, je veux bien que tu me guides là dessus. |
||
| * Installez virtualenv par votre moyen préféré | ||
| * Créez un environnement virtuel dans le dossier "audio" en lançant `virtualenv venv` (ici, il s'appelle venv) | ||
| * Activez venv avec les commandes suivantes : | ||
| * Linux : `source venv/bin/activate` | ||
| * Windows : (`set-executionpolicy unrestricted` si besoin) `.\venv\Scripts\activate` | ||
| * Installez les bibliothèques indiquées dans le requirement.txt | ||
| * Exécutez normalement le code python | ||
| * Quittez venv avec `deactivate` | ||
Uh oh!
There was an error while loading. Please reload this page.