NPR2Spotify/NPRSpotifySearch.py at master · Sockemboffer/NPR2Spotify · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import re
import json
import requests
import time
from urllib import parse
from collections import Counter
from unidecode import unidecode
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from difflib import SequenceMatcher
from ratelimit import limits, RateLimitException
from backoff import on_exception, expo
import Secrets

# Rate-limit settings passed to the `limits` decorator on the Spotify search
# methods below: `calls=NUMBER_OF_CALLS`, `period=IN_SECONDS`.
NUMBER_OF_CALLS = 5
IN_SECONDS = 1

# TODO create a way to make corrections and updates
    # TODO listener sends track correction (incorrect song, incorrect rendition), same as above
        # TODO replace current track with correct?
        # TODO missing, replace, add, found, duplicate entry (but show had unique uncaptured)
class NPRSpotifySearch:

    def __init__(self):
        """Set up the HTTP session, token provider and empty search state.

        A `requests.Session` is created and an adapter with retry/backoff is
        mounted for the Spotify API host. `self.secrets` is what the search
        methods call `GiveToken` on.
        """
        self.requestSession = requests.Session()
        self.secrets = Secrets.Secrets()
        # Status codes that make the adapter retry the request.
        # NOTE(review): this list includes 204 and several 4xx codes but not 429 - confirm intent.
        retryStatuses = [ 204, 304, 400, 401, 403, 404, 500, 502, 503, 504 ]
        self.retries = Retry(total=10, backoff_factor=1, status_forcelist=retryStatuses)
        spotifyAdapter = HTTPAdapter(max_retries=self.retries, pool_maxsize=25)
        self.requestSession.mount('https://api.spotify.com/', spotifyAdapter)
        # NOTE(review): second Secrets instance; not referenced by the other methods of this class.
        self.secretsSession = Secrets.Secrets()
        # Search state placeholders, all empty until populated.
        self.nprTrackName = None
        self.nprArtistsName = []
        self.track = None
        self.artists = []

    def SearchSpotify(self, track, artists, user_id):
        """Search Spotify for a track and return the best-matching result.

        For every artist search term (each word of each cleaned artist name,
        each full artist name as given, and finally an empty string) a fixed
        sequence of progressively looser queries is sent. All responses are
        then handed to ChooseBestMatch.

        Args:
            track: Track title to look up (string).
            artists: List of artist-name strings, or None when no artist is
                known. None is treated as an empty list, so the track is still
                searched with an empty artist term.
            user_id: Passed through to the search methods, which use it to
                fetch the bearer token.

        Returns:
            Whatever ChooseBestMatch returns for the collected responses.
        """
        if artists is None:
            artists = list()  # no artist entry: still search on the track title alone
        # Build the ordered, de-duplicated list of artist search terms.
        artistTerms = list()
        for artist in artists:
            cleaned = self.RemoveCommonPhrasesArtists(self.RemoveParenthesis(self.RemoveBrackets(artist)))
            candidates = cleaned.split()
            candidates.extend(artists)  # also try every full artist name as given
            for term in candidates:
                if term not in artistTerms:
                    artistTerms.append(term)
        artistTerms.append("")  # Need an empty string for when track match is really high but artist is 0

        # Title variants do not depend on the artist term, so compute them once.
        # Every artist term now gets the same sequence, starting from the
        # original title (previously later terms started from the stripped one).
        # TODO could probably drop all non-alphanumeric instead of an incremental hunt and drop approach
        title = unidecode(track)
        withoutBrackets = self.RemoveBrackets(title)
        # NOTE(review): derived from the original title, so the bracket-stripped
        # variant does not feed the later steps; with parentheses already gone,
        # the split("(") query below repeats the previous title. Kept to
        # preserve the existing query sequence - confirm intent.
        withoutParenthesis = self.RemoveParenthesis(title)
        withoutPhrases = self.RemoveCommonPhrasesTracks(withoutParenthesis)
        withoutNumbers = self.RemoveNumbers(withoutPhrases)

        trackResponses = list()
        for artistTerm in artistTerms:
            artistQuery = unidecode(artistTerm)
            looseArtistQuery = unidecode(self.ReplaceAmpersand(artistTerm))
            artistResponses = [
                self.SearchExplicitTrackAndArtist(title, artistQuery, user_id),
                self.SearchExplicitTrackAndArtist(withoutBrackets, artistQuery, user_id),
                self.SearchExplicitTrackAndArtist(withoutParenthesis, artistQuery, user_id),
                self.SearchExplicitTrackAndArtist(withoutPhrases, artistQuery, user_id),
                self.SearchExplicitTrackAndArtist(withoutNumbers, artistQuery, user_id),
                self.SearchImplicitTrackExplicitArtist(withoutNumbers, artistQuery, user_id),
                self.SearchExplicitTrackAndArtist(withoutNumbers.split("(")[0], artistQuery, user_id),
                self.SearchExplicitTrackAndArtist(withoutNumbers.split("[")[0], artistQuery, user_id),
                self.SearchExplicitTrackAndArtist(withoutNumbers.partition(":")[2], artistQuery, user_id),
                # Hail Marys: unfiltered keyword searches on the original title
                self.SearchImplicitTrackImplicitArtist(title, looseArtistQuery, user_id),
                self.SearchImplicitTrackAndArtistCombined(title, looseArtistQuery, user_id),
            ]
            trackResponses.append(artistResponses)
        print("\"{0}\" by \"{1}\" searched.".format(track, str(artists)))
        bestChoice = self.ChooseBestMatch(trackResponses, track, artists)
        return bestChoice

    # Uses difflib's SequenceMatcher to score results against match thresholds.
    def ChooseBestMatch(self, responses, nprTrack, nprArtists):
        bestMatch = dict()
        bestMatch["Result Track Name"] = None
        bestMatch["Result Artist Names"] = list()
        bestMatch["Result Track-Match Percent"] = 0.0
        bestMatch["Result Artists-Match Percent"] = 0.0
        bestMatch["Result Track URI"] = None
        if responses == None or len(responses) == 0.0:
            print("hmmm...")
        # this is all really gross looking
        else:
            for response in responses:
                for result in response:
                    if len(result["tracks"]["items"]) != 0.0:
                        for item in result["tracks"]["items"]:
                            # if item == None: # got a none one time, pretty rare, triples the time to make a playlist
                            resultTrackName = item["name"]
                            resultTrackNameSplit = list() # split track name words into a new list for comparison later
                            for word in resultTrackName.split(): # go through each word in track name
                                word = re.sub(r'[^\w]', ' ', word).lower().strip() # remove non-alphanumerics
                                resultTrackNameSplit.extend(word.split()) # split strings that end up like 'no 1'
                            resultTrackNameSplit = list(filter(None, resultTrackNameSplit)) # get rid of empty strings
                            nprTrackNameSplit = list()
                            for word in nprTrack.split():
                                word = re.sub(r'[^\w]', ' ', word).lower().strip()
                                nprTrackNameSplit.extend(word.split())
                            nprTrackNameSplit = list(filter(None, nprTrackNameSplit))
                            resultMatchesToNPR = [sub for sub in resultTrackNameSplit if sub in nprTrackNameSplit] # see what of the result matches the npr name
                            nprRemovedPhrasesArtists = list()
                            for artist in nprArtists:
                                artist = self.RemoveCommonPhrasesArtists(artist)
                                artist = re.sub(r'[^\w]', ' ', artist).lower().strip()
                                nprRemovedPhrasesArtists.extend(artist.split())
                            nprRemovedPhrasesArtists = list(filter(None, nprRemovedPhrasesArtists))
                            resultTrackArtistNames = item["artists"]
                            resultArtistNames = list() # need to split first, last, misc names into individual strings
                            resultArtistNamesCopy = list() # copy to store later in best match
                            for artist in resultTrackArtistNames:
                                resultArtistNamesCopy.append(artist["name"]) # copy to store later in best match
                                artist["name"] = re.sub(r'[^\w]', ' ', artist["name"]).lower().strip()
                                resultArtistNames.extend(artist["name"].split())
                            resultArtistNames = list(filter(None, resultArtistNames))
                            resultArtistNPRMatches = [sub for sub in resultArtistNames if sub in nprRemovedPhrasesArtists]
                            resultsArtistsSet = set(resultArtistNPRMatches)
                            nprArtistNamesSet = set(nprRemovedPhrasesArtists)
                            resultsTrackNameSet = set(resultMatchesToNPR)
                            nprTrackNameSet = set(nprTrackNameSplit)
                            # First, a quick exact-match check
                            if nprArtistNamesSet == resultsArtistsSet and nprTrackNameSet == resultsTrackNameSet or bestMatch["Result Track-Match Percent"] == 1.0 and bestMatch["Result Artists-Match Percent"] == 1.0: # check artist names first, less likely to match
                                bestMatch["Result Track Name"] = item["name"]
                                bestMatch["Result Artist Names"] = resultArtistNamesCopy
                                bestMatch["Result Album Name"] = item["album"]["name"]
                                bestMatch["Result Track-Match Percent"] = 1.0
                                bestMatch["Result Artists-Match Percent"] = 1.0
                                bestMatch["Result Track URI"] = item["uri"]
                                return bestMatch
                            # Fun weighting(?) results-land
                            else:
                                seqTrack = SequenceMatcher(a=nprTrackNameSplit, b=resultTrackNameSplit)
                                seqArtist = SequenceMatcher(a=nprRemovedPhrasesArtists, b=resultArtistNames)
                                trackMatchScore = seqTrack.ratio()
                                artistsMatchScore = seqArtist.ratio()
                                if artistsMatchScore >= 0.5 and artistsMatchScore >= bestMatch["Result Artists-Match Percent"]: # high artist name accuracy
                                    if trackMatchScore >= 0.75 and trackMatchScore >= bestMatch["Result Track-Match Percent"]: # good chance at match
                                        bestMatch["Result Track Name"] = item["name"]
                                        bestMatch["Result Artist Names"] = resultArtistNamesCopy
                                        bestMatch["Result Album Name"] = item["album"]["name"]
                                        bestMatch["Result Track-Match Percent"] = trackMatchScore
                                        bestMatch["Result Artists-Match Percent"] = artistsMatchScore
                                        bestMatch["Result Track URI"] = item["uri"]
                                if trackMatchScore >= 0.5 and trackMatchScore >= bestMatch["Result Track-Match Percent"]: # high artist name accuracy
                                    if  artistsMatchScore >= 0.65 and artistsMatchScore >= bestMatch["Result Artists-Match Percent"]: # good chance at match
                                        bestMatch["Result Track Name"] = item["name"]
                                        bestMatch["Result Artist Names"] = resultArtistNamesCopy
                                        bestMatch["Result Album Name"] = item["album"]["name"]
                                        bestMatch["Result Track-Match Percent"] = trackMatchScore
                                        bestMatch["Result Artists-Match Percent"] = artistsMatchScore
                                        bestMatch["Result Track URI"] = item["uri"]
            return bestMatch

    def RemoveBrackets(self, track):
        """Return `track` with every '[' and ']' character deleted."""
        bracketEraser = str.maketrans('', '', '[]')
        return track.translate(bracketEraser)

    def RemoveParenthesis(self, track):
        """Return `track` with every '(' and ')' character deleted."""
        parenthesisEraser = str.maketrans('', '', '()')
        return track.translate(parenthesisEraser)

    def RemoveNumbers(self, track):
        """Return `track` with every ASCII digit (0-9) deleted.

        Previously only the character '0' was removed, which turned a title
        like "No. 10" into "No. 1" instead of dropping the number.
        """
        digitEraser = str.maketrans('', '', '0123456789')
        return track.translate(digitEraser)

    def RemoveCommonPhrasesTracks(self, track):
        """Lower-case `track` and drop whitespace-separated words that are track filler terms.

        The remaining words are re-joined with single spaces.
        """
        fillerWords = {'feat.', 'feat', 'original', 'edit', 'featuring', 'feature'}
        keptWords = []
        for token in track.lower().split():
            if token in fillerWords:
                continue
            keptWords.append(token)
        return ' '.join(keptWords)

    def RemoveCommonPhrasesArtists(self, track):
        """Lower-case `track` (an artist string) and drop whitespace-separated filler words.

        The remaining words are re-joined with single spaces.
        """
        fillerWords = {'and', 'various', 'artists', "conducted", "by", "et", "al", "et.", "al."}
        keptWords = []
        for token in track.lower().split():
            if token in fillerWords:
                continue
            keptWords.append(token)
        return ' '.join(keptWords)

    def ReplaceAmpersand(self, track):
        """Return `track` with every '&' character replaced by the word 'and'."""
        pieces = track.split('&')
        return 'and'.join(pieces)

    # "Explicit" track or artist means the query uses a field filter, e.g. track:"Smells like teen spirit" artist:"Nirvana".
    # "Implicit" means the term is sent as plain quoted text without a field filter, which can give different results.
    # TODO better names
    @on_exception(expo, RateLimitException, max_tries=8)
    @limits(calls=NUMBER_OF_CALLS, period=IN_SECONDS)
    def SearchExplicitTrackAndArtist(self, track, artist, user_id):
        """Search Spotify with field-filtered track and artist terms.

        Sends `track:"<track>" artist:"<artist>"` (URL-quoted) to the search
        endpoint, authorised with the token from `self.secrets.GiveToken(user_id)`.

        Returns:
            The decoded JSON body of the response.

        Raises:
            Exception: when the response status code is not 200, 201 or 202.
        """
        searchTerms = 'track:' + '"' + track + '"' + ' ' + 'artist:"' + artist + '"'
        query = "https://api.spotify.com/v1/search?q={}&type=track%2Cartist&market=US&limit=5".format(parse.quote(searchTerms))
        bearerToken = self.secrets.GiveToken(user_id)
        requestHeaders = {"Content-Type": "application/json", "Authorization": "Bearer {}".format(bearerToken)}
        response = self.requestSession.get(query, headers=requestHeaders)
        if response.status_code in (200, 201, 202):
            return response.json()
        raise Exception('API response: {}'.format(response.status_code))

    @on_exception(expo, RateLimitException, max_tries=8)
    @limits(calls=NUMBER_OF_CALLS, period=IN_SECONDS)
    def SearchImplicitTrackExplicitArtist(self, track, artist, user_id):
        """Search Spotify with a plain quoted track and a field-filtered artist.

        Sends `"<track>" artist:"<artist>"` (URL-quoted) to the search
        endpoint, authorised with the token from `self.secrets.GiveToken(user_id)`.

        Returns:
            The decoded JSON body of the response.

        Raises:
            Exception: when the response status code is not 200, 201 or 202.
        """
        searchTerms = '"' + track + '"' + ' ' + 'artist:"' + artist + '"'
        query = "https://api.spotify.com/v1/search?q={}&type=track%2Cartist&market=US&limit=5".format(parse.quote(searchTerms))
        bearerToken = self.secrets.GiveToken(user_id)
        requestHeaders = {"Content-Type": "application/json", "Authorization": "Bearer {}".format(bearerToken)}
        response = self.requestSession.get(query, headers=requestHeaders)
        if response.status_code in (200, 201, 202):
            return response.json()
        raise Exception('API response: {}'.format(response.status_code))

    @on_exception(expo, RateLimitException, max_tries=8)
    @limits(calls=NUMBER_OF_CALLS, period=IN_SECONDS)
    def SearchImplicitTrackImplicitArtist(self, track, artist, user_id):
        """Search Spotify with plain quoted track and artist terms (no field filters).

        Sends `"<track>" "<artist>"` (URL-quoted) to the search endpoint,
        authorised with the token from `self.secrets.GiveToken(user_id)`.

        Returns:
            The decoded JSON body of the response.

        Raises:
            Exception: when the response status code is not 200, 201 or 202.
        """
        searchTerms = '"' + track + '"' + ' ' + '"' + artist + '"'
        query = "https://api.spotify.com/v1/search?q={}&type=track%2Cartist&market=US&limit=5".format(parse.quote(searchTerms))
        bearerToken = self.secrets.GiveToken(user_id)
        requestHeaders = {"Content-Type": "application/json", "Authorization": "Bearer {}".format(bearerToken)}
        response = self.requestSession.get(query, headers=requestHeaders)
        if response.status_code in (200, 201, 202):
            return response.json()
        raise Exception('API response: {}'.format(response.status_code))

    @on_exception(expo, RateLimitException, max_tries=8)
    @limits(calls=NUMBER_OF_CALLS, period=IN_SECONDS)
    def SearchImplicitTrackAndArtistCombined(self, track, artist, user_id):
        """Search Spotify tracks with the track and artist joined by " AND ".

        Sends `<track> AND <artist>` (URL-quoted, unquoted terms, type=track
        only) to the search endpoint, authorised with the token from
        `self.secrets.GiveToken(user_id)`.

        Returns:
            The decoded JSON body of the response.

        Raises:
            Exception: when the response status code is not 200, 201 or 202.
        """
        searchTerms = str(track + " AND " + artist)
        query = "https://api.spotify.com/v1/search?q={}&type=track&market=US&limit=5".format(parse.quote(searchTerms))
        bearerToken = self.secrets.GiveToken(user_id)
        requestHeaders = {"Content-Type": "application/json", "Authorization": "Bearer {}".format(bearerToken)}
        response = self.requestSession.get(query, headers=requestHeaders)
        if response.status_code in (200, 201, 202):
            return response.json()
        raise Exception('API response: {}'.format(response.status_code))