import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re


# Read the response data that the rest of the analysis works from.
rawResults = pd.read_csv("ACMExtended.csv")


# Preview the first rows of the frame.
rawResults.head()


# Tally how many respondents reported each offer count
rawResults["numOffers"].value_counts()

0    37
1    33
2    25
3     7
4     6
5     2
6     1
Name: numOffers, dtype: int64


# Split respondents into "received no offers" (0.0) and "received 1+ offers" (1.0).
# The comparison yields booleans; casting to float keeps the 0.0 / 1.0 encoding.
rawResults["gotOffer"] = (rawResults.numOffers > 0).astype(float)
offersN = rawResults.gotOffer.value_counts()
offersN

1.0    74
0.0    37
Name: gotOffer, dtype: int64


# Number of rows (respondents) in the full data set
totalN = len(rawResults)
totalN

111


# Group respondents by the gotOffer flag (0.0 = no offer, 1.0 = at least one).
offersVsNon = rawResults.groupby(by="gotOffer")


# TODO: move (presumably refers to the big-tech comparison below - confirm)

rawResults.groupby(by="bigTechOffers")[["avgIntvPrepHrs", "avgIntvPrepSpan"]].mean()


# Mean prep hours / prep span per gotOffer group, rounded to 2 decimals
offersVsNon[["avgIntvPrepHrs", "avgIntvPrepSpan"]].mean().round(2)


# Median of the same two columns per gotOffer group
offersVsNon[["avgIntvPrepHrs", "avgIntvPrepSpan"]].median()


# Portfolio-item columns, split by the gotOffer groups
supplimentary = offersVsNon[
    ["inclGithub", "inclPersonalProj", "inclSchoolProj", "inclWebsite", "inclLinkedIn"]
]
supplimentarySums = supplimentary.sum()
supplimentaryPct = supplimentary.mean()


# Within-group mean of each column, i.e. the share of offered (vs. non-offered)
# respondents that included the item, relative to that group's own size
supplimentaryPct.round(2)


# Per-group sums expressed as a percentage of the overall sample pool
(supplimentarySums / totalN * 100).round(2)


# Mean of gotOffer (share of respondents with an offer), website included vs. not
rawResults.groupby(by="inclWebsite")["gotOffer"].mean()

inclWebsite
0    0.595506
1    0.954545
Name: gotOffer, dtype: float64


# Portfolio-item columns grouped by bigTechOffers.
# Note: this rebinds the name `supplimentary`.
supplimentary = rawResults.groupby(by="bigTechOffers")[
    ["inclGithub", "inclPersonalProj", "inclSchoolProj", "inclWebsite", "inclLinkedIn"]
]
btSums = supplimentary.sum()
btPct = supplimentary.mean()


# Per-group mean of each portfolio column
btPct


# Sum of each prep resource used (the gotOffer column is selected alongside them).
prep = rawResults[
    [
        "gotOffer",
        "leetCodePrep",
        "crackingTheCodePrep",
        "youtubePrep",
        "mockInterviewPrep",
        "hackerrankPrep",
        "crackingThePmPrep",
        "492Prep",
    ]
]
prep.sum()

gotOffer                74.0
leetCodePrep           105.0
crackingTheCodePrep     57.0
youtubePrep             44.0
mockInterviewPrep       39.0
hackerrankPrep          39.0
crackingThePmPrep        1.0
492Prep                  1.0
dtype: float64


# Percent of each gotOffer group that used each resource (group mean * 100)
(prep.groupby(by="gotOffer").mean() * 100).round(2)


# Mean of leetCodePrep over the total sample, rounded to 2 decimals
round(prep["leetCodePrep"].mean(), 2)

0.95


def hourGrp(hrs):
    """Bucket a prep-hours value into one of 4 hour groups for visualization.

    Args:
        hrs: Numeric hours value; may be NaN when the answer is missing.

    Returns:
        "1-3" when hrs < 4, "4-7" when hrs < 8, "8-14" when hrs < 15,
        "15+" when hrs >= 15, and None when hrs is NaN.
    """
    if hrs < 4:
        return "1-3"
    if hrs < 8:
        return "4-7"
    if hrs < 15:
        return "8-14"
    if hrs >= 15:
        return "15+"
    # Every comparison above is False for NaN. Report it as missing instead of
    # letting it fall through into the "15+" bucket.
    return None
# Label every row with the hour group that hourGrp assigns to its avgIntvPrepHrs
rawResults["prepHoursGrp"] = rawResults["avgIntvPrepHrs"].apply(hourGrp)


# Raw hours next to the assigned group, for a visual check
rawResults[["avgIntvPrepHrs", "prepHoursGrp"]]


import statsmodels.formula.api as smf


# Quick check: take the first run of digits from a free-text answer.
# Raw string so that \d reaches the regex engine instead of being treated as an
# (invalid) string escape sequence.
re.findall(r"\d+", "more than 50")[0]

'50'


def refferals(x):
    """Extract the first integer that appears in a free-text answer.

    Args:
        x: Raw cell value, normally a string such as "more than 50".
            Non-string values (e.g. NaN for an empty cell) are tolerated.

    Returns:
        The first run of digits in x converted to int, or None when x is not
        a string or contains no digits. A message is printed in that case.
    """
    try:
        # Raw string keeps \d from being read as an invalid string escape.
        return int(re.findall(r"\d+", x)[0])
    except (TypeError, IndexError, ValueError):
        # TypeError: x is not a string (e.g. NaN); IndexError: no digits in x;
        # ValueError: int() rejected the matched digits.
        print("failed to convert ", x)
        return None
# Run refferals over each row's numReferrals value and store the result.
# refferals prints a message for every value it cannot convert.
rawResults["numRefferalsInt"] = rawResults.apply(
    lambda row: refferals(row["numReferrals"]),
    axis=1,
)

failed to convert  nan
failed to convert  nan
failed to convert  Three
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan
failed to convert  nan


# Run refferals over each row's numAppliedTo value and store the result
rawResults["nAppliedTo"] = rawResults.apply(
    lambda row: refferals(row["numAppliedTo"]),
    axis=1,
)


# Ordinary least squares: regress numOffers on avgIntvPrepSpan
test = smf.ols("numOffers ~ avgIntvPrepSpan", data=rawResults)
mod = test.fit()
# Show the fitted model's summary
mod.summary()


# Export the offer flag and prep-time columns to offersAndPrepTime.csv
rawResults[
    ["gotOffer", "prepHoursGrp", "avgIntvPrepHrs", "avgIntvPrepSpan"]
].to_csv("offersAndPrepTime.csv")

	Unnamed: 0	Timestamp	expectedGraduation	priorInternship	returnOffer	rolesAppliedFor	firstApplication	lastApplicatoin	numAppliedTo	resumeSup	...	bigTechAssessment	bigTechOffers	avgIntvPrepTime	avgIntvPrepSpan	avgIntvPrepHrs	numOffers	avgInitialReply	avgAssessmentToFirstIntvTime	avgFirstToFinalIntvTime	avgIntvToOfferTime
0	0	11/25/2020 15:58	6/11/2022	Yes	No	Software Engineer Internship	7/1/2020	8/15/2020	150	LinkedIn, Personal GitHub, School / class proj...	...	1	1	10.0	10.0	40.0	3	3.0	1.5	2.500000	1.0
1	1	11/25/2020 16:15	6/10/2023	No	NaN	Software Engineer Internship, 1st / 2nd year i...	8/28/2020	10/30/2020	36	Personal projects	...	1	0	1.5	1.5	4.0	1	0.0	NaN	1.714286	1.0
2	2	11/25/2020 16:25	12/17/2022	No	NaN	Software Engineer Internship	8/26/2020	10/16/2020	165	LinkedIn, Personal GitHub, Personal website, P...	...	1	1	3.5	3.5	13.0	3	2.0	2.0	1.000000	1.0
3	3	11/25/2020 16:37	6/14/2022	Yes	Yes	Software Engineer Internship	8/20/2020	11/4/2020	70	LinkedIn, Personal GitHub, Personal projects	...	1	0	3.5	3.5	1.0	1	1.0	NaN	NaN	NaN
4	4	11/25/2020 17:18	3/31/2022	No	NaN	Software Engineer Internship	7/10/2020	11/19/2020	270	LinkedIn, Personal GitHub, Personal projects, ...	...	0	0	10.0	10.0	8.0	4	NaN	NaN	NaN	NaN

	avgIntvPrepHrs	avgIntvPrepSpan
bigTechOffers
0	4.838028	9.852113
1	7.525000	9.250000

	inclGithub	inclPersonalProj	inclSchoolProj	inclWebsite	inclLinkedIn
gotOffer
0.0	0.46	0.57	0.54	0.03	0.89
1.0	0.55	0.62	0.47	0.28	0.88

	inclGithub	inclPersonalProj	inclSchoolProj	inclWebsite	inclLinkedIn
gotOffer
0.0	15.32	18.92	18.02	0.90	29.73
1.0	36.94	41.44	31.53	18.92	58.56

	inclGithub	inclPersonalProj	inclSchoolProj	inclWebsite	inclLinkedIn
bigTechOffers
0	0.535211	0.619718	0.535211	0.15493	0.859155
1	0.500000	0.575000	0.425000	0.27500	0.925000

Analysis for POD's Internship Report.

The Code

Average prep time for respondents that received an offer vs. respondents with no offers

Comparing which portfolio documents were included the most

Comparing interview prep resources

	leetCodePrep	crackingTheCodePrep	youtubePrep	mockInterviewPrep	hackerrankPrep	crackingThePmPrep	492Prep
gotOffer
0.0	91.89	56.76	40.54	40.54	40.54	0.00	0.00
1.0	95.95	48.65	39.19	32.43	32.43	1.35	1.35

	avgIntvPrepHrs	prepHoursGrp
0	40.0	15+
1	4.0	4-7
2	13.0	8-14
3	1.0	1-3
4	8.0	8-14
...	...	...
106	8.0	8-14
107	4.0	4-7
108	4.0	4-7
109	1.0	1-3
110	4.0	4-7

Dep. Variable:	numOffers	R-squared:	0.005
Model:	OLS	Adj. R-squared:	-0.004
Method:	Least Squares	F-statistic:	0.5414
Date:	Sun, 24 Jan 2021	Prob (F-statistic):	0.463
Time:	11:19:36	Log-Likelihood:	-187.40
No. Observations:	111	AIC:	378.8
Df Residuals:	109	BIC:	384.2
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	1.2175	0.166	7.342	0.000	0.889	1.546
avgIntvPrepSpan	0.0083	0.011	0.736	0.463	-0.014	0.031

Omnibus:	24.151	Durbin-Watson:	2.030
Prob(Omnibus):	0.000	Jarque-Bera (JB):	31.654
Skew:	1.166	Prob(JB):	1.34e-07
Kurtosis:	4.188	Cond. No.	19.5