Hearing "mistletoe" in almost every other Christmas song felt a bit bizarre. While I was exposed to some Christmas culture growing up back home in Saudi Arabia, I didn't understand the relevance of mistletoe to Christmas. I eventually learned it and, like any sane person, my first instinct was to quantify it... okay, fine: like any sane data scientist.
I scraped the lyrics of 69 popular Christmas songs and compared how often each word occurs in them with its "average" occurrence (scraped from the "500 Greatest Songs" playlist) to find the most uniquely Christmas lyrics and how often they occur.
Turns out, 'mistletoe' was only the 14th most Christmas-unique word according to this analysis; you can hover just over the 70 mark on the x-axis and find it.
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Presumably the folder holding the Selenium driver; nothing in the visible
# code reads it. Written as a raw string so the backslashes are not parsed as
# (invalid) escape sequences -- the resulting value is unchanged.
PATH = r"D:\Program Files\Selenium"

# Start Chrome and open the article that lists the Christmas songs.
browser = webdriver.Chrome()
browser.get("https://www.countryliving.com/life/entertainment/g29326536/best-christmas-songs/")

# Collect the elements whose text is later parsed into title/artist.
# The find_elements_by_* helpers were removed in Selenium 4.3;
# find_elements(By.<kind>, value) is available in old and new releases alike.
songs = browser.find_elements(By.CLASS_NAME, "listicle-slide-hed-text")
len(songs)
# Turn every scraped heading into a {title, artist} row. Headings are expected
# to read '<title> by <artist>' or, failing that, '<title> from <source>'.
christmasRows = []
for song in songs:
    heading = song.text
    # Split on the LAST separator so a title that itself contains " by "
    # (or " from ") is kept whole.
    parts = heading.rsplit(" by ", 1)
    if len(parts) == 1:
        parts = heading.rsplit(" from ", 1)
    if len(parts) == 1:
        # Neither separator found: keep the heading as the title and leave the
        # artist blank instead of failing with an IndexError.
        parts = [heading, ""]
    christmasRows.append({'title': parts[0].replace("\"", ""), 'artist': parts[1]})

# DataFrame.append was removed in pandas 2.0, so the rows are gathered first
# and the frame is built once. dtype=object keeps the still-empty 'lyrics'
# column able to receive strings later.
songLyrics = pd.DataFrame(christmasRows, columns=['title', 'artist', 'lyrics'], dtype=object)
songLyrics.head(5)
# Start on the Genius home page: fillLyrics looks for the search box on
# whatever page the browser currently has open.
browser.get("https://genius.com/")
def fillLyrics(song):
    """Search Genius for one song and store its cleaned lyrics in ``songLyrics``.

    Types "<title> <artist>" into the search box of the page currently open in
    the module-level ``browser``, opens the first result card, reads the text
    of the first ``<p>`` element, strips section tags and basic punctuation,
    and writes the result to the ``lyrics`` column of every ``songLyrics`` row
    whose title equals ``song.title``.

    Args:
        song: A row of ``songLyrics`` (any object exposing ``title`` and
            ``artist`` attributes).

    Returns:
        None. Errors are handled best-effort: the failure is printed and the
        row's lyrics are left as they were.
    """
    try:
        # find_element_by_* was removed in Selenium 4.3; find_element(By...)
        # works on both old and new releases.
        searchBar = browser.find_element(By.NAME, "q")
        if song.title[0] == "(":
            songTitle = song.title.replace("(", "")
        else:
            songTitle = song.title
        searchBar.send_keys(songTitle, " ", song.artist)
        searchBar.send_keys(Keys.RETURN)
        time.sleep(3)  # wait for the results page before looking for a card
        browser.find_element(By.CLASS_NAME, "mini_card").click()
        text = browser.find_element(By.TAG_NAME, "p").text

        # Clean up the lyrics.
        # Drop [Chorus], [Verse 1], ... tags. The match stops at the first
        # closing bracket so lyrics between two tags on one line survive.
        text = re.sub(r"\[[^\]\n]+\]", "", text)
        text = re.sub(r"[,.!]", "", text)
        text = text.replace("\n", " ")

        songLyrics.loc[songLyrics.title == song.title, "lyrics"] = text
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still stops the run.
        print(song.title, " by ", song.artist, " failed")
# Fetch lyrics for every row. fillLyrics writes straight into songLyrics, so
# the value returned by apply is not used.
songLyrics.apply(fillLyrics, axis=1)
songLyrics.sample(4)
The link mentions Rolling Stone, but the playlist is by various artists, and it is the most comprehensive playlist I could find.
# Open the annotated playlist and grab every link inside its first <p>.
browser.get("https://genius.com/Rolling-stone-the-500-greatest-songs-of-all-time-annotated")
links = browser.find_element(By.TAG_NAME, "p").find_elements(By.TAG_NAME, "a")

# Record the POSITIONS of the links that are not artist pages. Positions are
# stored rather than elements because the scraping loop leaves this page and
# looks the links up again on every pass.
songLinks = []
for i, link in enumerate(links):
    href = link.get_attribute("href")
    # An anchor without an href has nothing to open, so it is skipped instead
    # of raising a TypeError on the membership test.
    if href and "artists" not in href:
        songLinks.append(i)
curSong = 0

# Visit every song link in turn and keep (name, cleaned lyrics) for each page
# that could be read. DataFrame.append was removed in pandas 2.0, so rows are
# collected in a list and the frame is built once after the loop.
top500Rows = []
for i in songLinks:
    try:
        # Look the links up again on each pass: element references obtained
        # before a navigation cannot be reused afterwards.
        links = browser.find_element(By.TAG_NAME, "p").find_elements(By.TAG_NAME, "a")
        songName = links[i].text
        links[i].click()
        try:
            text = browser.find_element(By.TAG_NAME, "p").text
            # Clean up the lyrics.
            # Drop [Chorus], [Verse 1], ... tags; stop at the first closing
            # bracket so lyrics between two tags on one line survive.
            text = re.sub(r"\[[^\]\n]+\]", "", text)
            text = re.sub(r"[,.!]", "", text)
            text = text.replace("\n", " ")
            top500Rows.append({'songName': songName, 'lyrics': text})
        finally:
            # Once the click succeeded, always return to the playlist, even if
            # reading the lyrics failed; otherwise every later iteration
            # would run against the wrong page.
            browser.back()
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still stops the run.
        print(i, " failed")

top500 = pd.DataFrame(top500Rows, columns=["songName", "lyrics"])
top500.sample(5)
# Running tally with one row per distinct lower-cased word.
wordCount = pd.DataFrame(columns=["word", "christmasCt", "regCt", "ratio"])


def countWords(lyrics, dictionary):
    """Add the words of one song to the module-level ``wordCount`` table.

    The lyrics are lower-cased and split on whitespace. Every distinct word
    gets a row in ``wordCount`` (created with zero counts if it is new) and
    its ``dictionary`` column is increased by the number of occurrences.

    Args:
        lyrics: Lyrics of a single song. Anything that is not a string (for
            example the NaN left behind by a failed scrape) is reported with
            ``print("Failure")`` and skipped.
        dictionary: Name of the ``wordCount`` column to increment, i.e.
            ``"christmasCt"`` or ``"regCt"``.

    Returns:
        None. ``wordCount`` is modified in place.
    """
    if not isinstance(lyrics, str):
        print("Failure")
        return

    # split() without an argument discards the empty strings that consecutive
    # spaces would otherwise produce, so "" is never counted as a word.
    tally = Counter(lyrics.lower().split())

    # One dictionary lookup per word instead of scanning the whole frame.
    row_of = {word: row for row, word in enumerate(wordCount["word"])}
    blank = {"word": None, "christmasCt": 0, "regCt": 0}
    for word, hits in tally.items():
        row = row_of.get(word)
        if row is None:
            # New word: add a row in place (DataFrame.append was removed in
            # pandas 2.0). Columns without a default start out as NaN.
            row = len(wordCount)
            blank["word"] = word
            wordCount.loc[row] = [blank.get(col, float("nan")) for col in wordCount.columns]
        wordCount.at[row, dictionary] += hits
# Tally every word of both playlists into wordCount.
songLyrics.apply(lambda x: countWords(x.lyrics, "christmasCt"), axis=1)  # Counting words in the christmas Playlist
top500.apply(lambda x: countWords(x.lyrics, "regCt"), axis=1)  # Counting words in top 500 playlist

# Average occurrences per song. The Christmas denominator is the number of
# songs whose lyrics were actually scraped, not a fixed 69, mirroring top500,
# which only ever holds songs that were read successfully.
christmasSongs = songLyrics["lyrics"].notna().sum()
wordCount["christmasPerSong"] = wordCount.christmasCt / christmasSongs
wordCount["regPerSong"] = wordCount.regCt / top500.shape[0]
wordCount.loc[wordCount.word == "christmas"]
def div(a, b, zero_substitute=1 / 485):
    """Return ``a / b``, replacing a zero divisor with a small stand-in value.

    Args:
        a: Numerator.
        b: Denominator. When it equals 0, ``zero_substitute`` is used instead
            so the division is always defined.
        zero_substitute: Divisor used in place of 0. The default of 1/485
            presumably stands for "one occurrence across the 485 reference
            songs" of the original run -- TODO confirm. Must not be 0.

    Returns:
        The quotient ``a / b`` (or ``a / zero_substitute`` when ``b`` is 0).
    """
    if b == 0:  # Preventing division by zero
        b = zero_substitute
    return a / b
# How many times more often a word appears per Christmas song than per
# reference song (div substitutes a small divisor for words the reference
# playlist never uses).
wordCount["christmasPerReg"] = wordCount.apply(lambda x: div(x.christmasPerSong, x.regPerSong), axis=1)

# Weight that ratio by the raw Christmas count so that rare one-off words do
# not dominate the ranking.
wordCount["christmasRtg"] = wordCount.christmasPerReg * wordCount.christmasCt
wordCount.sort_values(by="christmasRtg", ascending=False).head(15)

# Words that never occur in the reference playlist.
wordCount.loc[wordCount.regCt == 0]

wordCount.to_csv("christmasVReg.csv")

# quit() ends the whole WebDriver session, including the driver process;
# close() would only close the current window.
browser.quit()