import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from sklearn.preprocessing import MinMaxScaler

df_raw = pd.read_csv('survey_data.csv', low_memory=False)  # Load the raw survey CSV into a DataFrame
df_raw.head()  # Preview the first 5 rows of the raw data

df_raw.shape  # (number of rows, number of columns) of the raw data

(65437, 114)

df_raw.columns.to_list()  # List every column name available in the raw survey data

['ResponseId',
 'MainBranch',
 'Age',
 'Employment',
 'RemoteWork',
 'Check',
 'CodingActivities',
 'EdLevel',
 'LearnCode',
 'LearnCodeOnline',
 'TechDoc',
 'YearsCode',
 'YearsCodePro',
 'DevType',
 'OrgSize',
 'PurchaseInfluence',
 'BuyNewTool',
 'BuildvsBuy',
 'TechEndorse',
 'Country',
 'Currency',
 'CompTotal',
 'LanguageHaveWorkedWith',
 'LanguageWantToWorkWith',
 'LanguageAdmired',
 'DatabaseHaveWorkedWith',
 'DatabaseWantToWorkWith',
 'DatabaseAdmired',
 'PlatformHaveWorkedWith',
 'PlatformWantToWorkWith',
 'PlatformAdmired',
 'WebframeHaveWorkedWith',
 'WebframeWantToWorkWith',
 'WebframeAdmired',
 'EmbeddedHaveWorkedWith',
 'EmbeddedWantToWorkWith',
 'EmbeddedAdmired',
 'MiscTechHaveWorkedWith',
 'MiscTechWantToWorkWith',
 'MiscTechAdmired',
 'ToolsTechHaveWorkedWith',
 'ToolsTechWantToWorkWith',
 'ToolsTechAdmired',
 'NEWCollabToolsHaveWorkedWith',
 'NEWCollabToolsWantToWorkWith',
 'NEWCollabToolsAdmired',
 'OpSysPersonal use',
 'OpSysProfessional use',
 'OfficeStackAsyncHaveWorkedWith',
 'OfficeStackAsyncWantToWorkWith',
 'OfficeStackAsyncAdmired',
 'OfficeStackSyncHaveWorkedWith',
 'OfficeStackSyncWantToWorkWith',
 'OfficeStackSyncAdmired',
 'AISearchDevHaveWorkedWith',
 'AISearchDevWantToWorkWith',
 'AISearchDevAdmired',
 'NEWSOSites',
 'SOVisitFreq',
 'SOAccount',
 'SOPartFreq',
 'SOHow',
 'SOComm',
 'AISelect',
 'AISent',
 'AIBen',
 'AIAcc',
 'AIComplex',
 'AIToolCurrently Using',
 'AIToolInterested in Using',
 'AIToolNot interested in Using',
 'AINextMuch more integrated',
 'AINextNo change',
 'AINextMore integrated',
 'AINextLess integrated',
 'AINextMuch less integrated',
 'AIThreat',
 'AIEthics',
 'AIChallenges',
 'TBranch',
 'ICorPM',
 'WorkExp',
 'Knowledge_1',
 'Knowledge_2',
 'Knowledge_3',
 'Knowledge_4',
 'Knowledge_5',
 'Knowledge_6',
 'Knowledge_7',
 'Knowledge_8',
 'Knowledge_9',
 'Frequency_1',
 'Frequency_2',
 'Frequency_3',
 'TimeSearching',
 'TimeAnswering',
 'Frustration',
 'ProfessionalTech',
 'ProfessionalCloud',
 'ProfessionalQuestion',
 'Industry',
 'JobSatPoints_1',
 'JobSatPoints_4',
 'JobSatPoints_5',
 'JobSatPoints_6',
 'JobSatPoints_7',
 'JobSatPoints_8',
 'JobSatPoints_9',
 'JobSatPoints_10',
 'JobSatPoints_11',
 'SurveyLength',
 'SurveyEase',
 'ConvertedCompYearly',
 'JobSat']

# Columns kept for the analysis, in the order they appear in the working DataFrame.
analysis_columns = [
    'Age',
    'Employment',
    'WorkExp',
    'RemoteWork',
    'CodingActivities',
    'EdLevel',
    'YearsCode',
    'DevType',
    'Country',
    'ConvertedCompYearly',
    'LanguageHaveWorkedWith',
    'LanguageWantToWorkWith',
    'LanguageAdmired',
    'DatabaseHaveWorkedWith',
    'DatabaseWantToWorkWith',
    'PlatformHaveWorkedWith',
    'PlatformWantToWorkWith',
    'WebframeHaveWorkedWith',
    'WebframeWantToWorkWith',
    'OpSysProfessional use',
    'Industry',
    'JobSat',
]
df = df_raw[analysis_columns]

df.head()  # Preview the first 5 rows of the reduced DataFrame

df.describe(include='all')  # Descriptive statistics for every selected column

df.dtypes  # Data type of each selected column

Age                        object
Employment                 object
WorkExp                   float64
RemoteWork                 object
CodingActivities           object
EdLevel                    object
YearsCode                  object
DevType                    object
Country                    object
ConvertedCompYearly       float64
LanguageHaveWorkedWith     object
LanguageWantToWorkWith     object
LanguageAdmired            object
DatabaseHaveWorkedWith     object
DatabaseWantToWorkWith     object
PlatformHaveWorkedWith     object
PlatformWantToWorkWith     object
WebframeHaveWorkedWith     object
WebframeWantToWorkWith     object
OpSysProfessional use      object
Industry                   object
JobSat                    float64
dtype: object

df.isnull().sum()  # Number of missing values in each column

Age                           0
Employment                    0
WorkExp                   35779
RemoteWork                10631
CodingActivities          10971
EdLevel                    4653
YearsCode                  5568
DevType                    5992
Country                    6507
ConvertedCompYearly       42002
LanguageHaveWorkedWith     5692
LanguageWantToWorkWith     9685
LanguageAdmired           14565
DatabaseHaveWorkedWith    15183
DatabaseWantToWorkWith    22879
PlatformHaveWorkedWith    23071
PlatformWantToWorkWith    30905
WebframeHaveWorkedWith    20276
WebframeWantToWorkWith    26902
OpSysProfessional use     12464
Industry                  36579
JobSat                    36311
dtype: int64

print(df.duplicated().sum())  # Print how many rows are flagged as duplicates
df = df.drop_duplicates()  # Rebind df to the de-duplicated DataFrame

2190

# Count, per age bracket, how many respondents reported using each database.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of `df` (which would raise SettingWithCopyWarning).
dfExp_Age = df[['Age', 'DatabaseHaveWorkedWith']].copy()
# Each answer is a ';'-separated list of databases: split it, then emit one row per database.
dfExp_Age['DatabaseHaveWorkedWith'] = dfExp_Age['DatabaseHaveWorkedWith'].str.split(';')
dfExp_Age = dfExp_Age.explode('DatabaseHaveWorkedWith')
Exp_Age_grouped = dfExp_Age.groupby(['Age', 'DatabaseHaveWorkedWith']).size().reset_index(name='count')

# Age x database table of counts; combinations that never occur are filled with 0.
Exp_Age_pivot_table = Exp_Age_grouped.pivot(index='Age', columns='DatabaseHaveWorkedWith', values='count').fillna(0)
filt = Exp_Age_pivot_table.sum().sort_values(ascending=False).head().index  # The 5 databases with the highest totals
Exp_Age_pivot_table = Exp_Age_pivot_table[filt]  # Keep only those 5 columns
Exp_Age_pivot_table  # Show the results

# Grouped (non-stacked) bar chart: one cluster per age bracket, one bar per database.
Exp_Age_pivot_table.plot(kind='bar', stacked=False, figsize=(10, 6), colormap='tab20c', edgecolor='black')

plt.title('Databases used by age ranges')  # Set plot title
plt.ylabel('Number of respondents')  # The pivot table holds raw counts, not proportions
plt.xlabel('Age')  # Set x-axis label
plt.legend(title='Database')  # Legend titled 'Database'
plt.xticks(rotation=70)  # Rotate x-axis labels
plt.tight_layout()  # Run after rotating the tick labels so they are taken into account
plt.figtext(0.5, -0.05, 'Figure 1. Databases used by age ranges',  # Centered caption below the plot
            wrap=True, horizontalalignment='center', fontsize=10)
plt.show()  # Display the plot

# Number of respondents per country, as a two-column frame: Country / Developer_Count.
country_counts = (
    df['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Developer_Count')
)

# World map shaded by the number of respondents in each country.
choropleth_options = dict(
    locations='Country',
    locationmode='country names',
    color='Developer_Count',
    color_continuous_scale='Viridis',
    title='Number of developers by country',
)
fig = px.choropleth(country_counts, **choropleth_options)

# Centered caption placed below the plotting area (paper coordinates).
caption_options = dict(
    text="Figure 2. Number of developers by country based on survey data.",
    xref="paper",
    yref="paper",
    x=0.5,
    y=-0.15,
    showarrow=False,
    font=dict(size=12),
    align="center",
)
fig.add_annotation(**caption_options)
fig.show()

topCountry = df['Country'].value_counts().head(10).index  # Names of the 10 countries with the most respondents
topCountry
filtroCtryCCY = df[df['Country'].isin(topCountry)]  # Keep only rows whose Country is one of those 10


def remove_outliers(df, column, k=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive),
    where ``Q1``/``Q3`` are the 25th/75th percentiles of ``df[column]``.
    Rows whose ``column`` value is NaN fail the comparison and are dropped too.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Name of the numeric column used to detect outliers.
        k: Fence multiplier applied to the IQR. Defaults to 1.5.

    Returns:
        A new DataFrame (an independent copy, safe to assign columns on)
        containing only the rows within the fences.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # between() includes both bounds by default, matching `>= lower` and `<= upper`.
    within_fences = df[column].between(lower, upper)
    # .copy() detaches the result from `df` so callers can add/overwrite
    # columns without triggering SettingWithCopyWarning.
    return df[within_fences].copy()

# Drop compensation outliers among the top-10 countries. .copy() detaches the
# result from the filtered slice so the column assignment below does not raise
# SettingWithCopyWarning.
filtroCtryCCY = remove_outliers(filtroCtryCCY, 'ConvertedCompYearly').copy()


# Short display names for the two longest country names.
namereplace = {"United Kingdom of Great Britain and Northern Ireland":"UK", "United States of America":"USA"}
filtroCtryCCY['Country'] = filtroCtryCCY['Country'].replace(namereplace)

plt.figure(figsize=(12,7))  # Initialize new matplotlib figure

plt.title('Annual compensation in the top 10 countries with the most developers')  # Set plot title
sns.boxplot(x='Country', y='ConvertedCompYearly', data=filtroCtryCCY)  # One box per country
# NOTE(review): sns.set runs after the boxplot is drawn, so it presumably only
# styles figures created later — confirm whether it should precede plt.figure.
sns.set(style='darkgrid')
plt.ylabel('Converted Compensation (Yearly)')  # Set y-axis label
plt.xticks(rotation=90)  # Rotate x-axis labels
plt.tight_layout()  # Adjust layout spacing
plt.figtext(0.5, -0.05,'Figure 3. Annual compensation in the top 10 countries with the most developers', # Centered caption below the plot
            wrap=True, horizontalalignment='center', fontsize=10)
plt.show()  # Display the plot

# Bar chart of yearly compensation per work modality, on outlier-free salaries.
dfRemConv = remove_outliers(df, 'ConvertedCompYearly')

plt.figure(figsize=(10, 5))
ax = sns.barplot(
    data=dfRemConv,
    x='RemoteWork',
    y='ConvertedCompYearly',
    color='brown',
    edgecolor='black',
)
ax.set_title('Annual compensation by work modality')
ax.set_xlabel('Work Modality')
ax.set_ylabel('Converted Compensation (Yearly)')

# Leave 20% headroom above the highest group mean so the value labels fit.
max_y = dfRemConv.groupby('RemoteWork')['ConvertedCompYearly'].mean().max()
ax.set_ylim(0, max_y * 1.2)

# Write each bar's height, rounded to an integer, slightly above the bar.
for patch in ax.patches:
    bar_height = patch.get_height()
    bar_center = patch.get_x() + patch.get_width()/2
    ax.text(bar_center, bar_height + 4500, f'{bar_height:.0f}', ha='center', va='bottom')

plt.figtext(
    0.5, -0.05,
    'Figure 4. Annual compensation by work modality',  # Centered caption below the plot
    wrap=True, horizontalalignment='center', fontsize=10,
)
plt.tight_layout()
plt.show()

# One row per (respondent, language): drop incomplete rows, split the
# ';'-separated language list and expand it.
dfProgram = (
    df[['LanguageHaveWorkedWith', 'Country']]
    .dropna()
    .assign(LanguageHaveWorkedWith=lambda frame: frame['LanguageHaveWorkedWith'].str.split(';'))
    .explode('LanguageHaveWorkedWith')
)

# The 10 most frequently mentioned languages.
filtro = (
    dfProgram['LanguageHaveWorkedWith']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# Restrict to those languages, then to the 20 countries with the most remaining rows.
uses_top_language = dfProgram['LanguageHaveWorkedWith'].isin(filtro)
dfProgram_filter = dfProgram[uses_top_language]
filtro2 = (
    dfProgram_filter['Country']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
    .index
)
in_top_country = dfProgram_filter['Country'].isin(filtro2)
dfProgram_filter = dfProgram_filter[in_top_country]

# Count rows per (country, language) pair and shorten long country names.
grupo = (
    dfProgram_filter
    .groupby(['Country', 'LanguageHaveWorkedWith'])
    .size()
    .reset_index(name='count')
    .replace(namereplace)
)


# Share of all language mentions held by the most used language and by the top 10.
LangPorcentage = dfProgram.groupby(['Country', 'LanguageHaveWorkedWith']).size().reset_index(name='count')
LangPorcentage = (
    LangPorcentage[['LanguageHaveWorkedWith', 'count']]
    .groupby('LanguageHaveWorkedWith')
    .sum()
    .sort_values(['count'], ascending=False)
    .reset_index()
)
TotalP = LangPorcentage['count'].sum()
Top1 = LangPorcentage.iloc[0, 1] / TotalP * 100
Top10_total = (LangPorcentage['count'].iloc[:10].sum() / TotalP) * 100
# \033[1m ... \033[0m wraps the message in ANSI bold escape codes.
print(
    f'\033[1mThe top 1 language is {LangPorcentage.iloc[0, 0]} with {Top1:.1f}% '
    f'and the sum of the top 10 most used languages is {Top10_total:.1f}%\033[0m'
)
print('')

# Bubble plot: one bubble per (language, country) pair; both the bubble area
# and its color encode the number of mentions.
plt.figure(figsize=(15,10))  # Initialize new matplotlib figure
plt.scatter(
    grupo['LanguageHaveWorkedWith'],
    grupo['Country'],
    s=grupo['count'],
    c=grupo['count'],
    alpha = 0.5
)
plt.title('Bubble Plot for Languages Worked With Across Countries')  # Set plot title
plt.xlabel('Languages') # Set x-axis label
plt.ylabel('Country')  # Set y-axis label
plt.xticks(rotation = 45)  # Rotate x-axis labels
plt.colorbar(label='Frequency')  # Colorbar for the mention counts
plt.tight_layout()  # Run after the colorbar is added so it is part of the layout
plt.figtext(0.5, -0.05,'Figure 5. Bubble Plot for Languages Worked With Across Countries', # Centered caption below the plot
            wrap=True, horizontalalignment='center', fontsize=10)
plt.show()  # Display the plot

The top 1 language is JavaScript with 11.6% and the sum of the top 10 most used languages is 72.9%

# One row per (respondent, desired language): drop incomplete rows, split the
# ';'-separated list and expand it.
dfLW = df[['LanguageWantToWorkWith', 'Country']].dropna()
dfLW['LanguageWantToWorkWith'] = dfLW['LanguageWantToWorkWith'].str.split(';')
dfLW = dfLW.explode('LanguageWantToWorkWith')

filtroLanguages = dfLW['LanguageWantToWorkWith'].value_counts().sort_values(ascending=False).head(10).index  # Top 10 desired languages

dfLW = dfLW[dfLW['LanguageWantToWorkWith'].isin(filtroLanguages)]  # Keep only the top 10 desired languages
dfLW = dfLW[dfLW['Country'].isin(filtro2)]  # Keep only the countries selected in filtro2

# Count rows per (country, desired language) pair and shorten long country names.
group = dfLW.groupby(['Country', 'LanguageWantToWorkWith']).size().reset_index(name='count').replace(namereplace)

plt.figure(figsize=(15,10))  # Initialize new matplotlib figure
plt.scatter(
    group['LanguageWantToWorkWith'],
    group['Country'],
    s=group['count'],
    c=group['count'],
    alpha = 0.5
)
# The plotted column is LanguageWantToWorkWith, so the title and caption refer
# to languages (they previously said "Databases").
plt.title('Bubble Plot for Languages Wanted Across Countries', fontsize=16)  # Set plot title
plt.xlabel('Languages')  # Set x-axis label
plt.ylabel('Country')  # Set y-axis label
plt.xticks(rotation = 45)  # Rotate x-axis labels
plt.tight_layout()  # Adjust layout spacing
plt.colorbar(label='Frequency')  # Colorbar for the mention counts
plt.figtext(0.5, -0.05,'Figure 6. Bubble Plot for Languages Wanted Across Countries', # Centered caption below the plot
            wrap=True, horizontalalignment='center', fontsize=13)
plt.show()  # Display the plot

# Mean yearly compensation for each value of WorkExp, on outlier-free salaries.
dfExpSalarie = df[['WorkExp', 'ConvertedCompYearly']]
dfExpSalarie = dfExpSalarie.dropna()
dfExpSalarie_outliers = remove_outliers(dfExpSalarie, 'ConvertedCompYearly')
groupExpSalarie = dfExpSalarie_outliers.groupby('WorkExp')['ConvertedCompYearly'].mean().reset_index()

groupExpSalarie = groupExpSalarie.sort_values('WorkExp')

# Compare the mean salary at WorkExp == 0 with the mean salary at WorkExp == 5.
initial_salary = groupExpSalarie.loc[groupExpSalarie['WorkExp'] == 0, 'ConvertedCompYearly'].values[0]
salary_year_5 = groupExpSalarie.loc[groupExpSalarie['WorkExp'] == 5, 'ConvertedCompYearly'].values[0]

total_pct_increase = ((salary_year_5 - initial_salary) / initial_salary) * 100

# The baseline is WorkExp == 0, so the message says "0 to 5 years" rather than "year 1".
print(f"Total salary increase from 0 to 5 years of experience: {total_pct_increase:.2f}%")

sns.scatterplot(data=groupExpSalarie, x='WorkExp', y='ConvertedCompYearly')
plt.title('Work Experience vs Annual Salary')
plt.xlabel('Years of Experience')
plt.ylabel('Annual Salary (USD)')
plt.figtext(0.5, -0.05,'Figure 7. Work Experience vs Annual Salary', # Centered caption below the plot
            wrap=True, horizontalalignment='center', fontsize=9)
plt.show()

Total salary increase from year 1 to year 5: 152.03%

# One entry per (respondent, desired web framework).
dfWeb_Frame = df['WebframeWantToWorkWith'].dropna().str.split(';')
dfWeb_Frame = dfWeb_Frame.explode()

top = dfWeb_Frame.value_counts()  # Mentions per framework, most frequent first
top10 = top.head(10)  # The 10 most wanted frameworks
others = top.iloc[10:].sum()  # Combined mentions of every framework outside the top 10

labels = top10.index.tolist() + ['Others']
sizes = top10.values.tolist() + [others]

plt.figure(figsize=(10, 6))  # Initialize new matplotlib figure
patches, texts, autotexts = plt.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
    wedgeprops={'edgecolor': 'black'}
)

for text in texts:
    text.set_fontsize(11)
for autotext in autotexts:
    autotext.set_fontsize(11)
    autotext.set_color('white')

# Ten frameworks are plotted, so the title says "Top 10" (it previously said "Top 5").
plt.title('Pie Chart of Preferred Web Frameworks (Top 10 + Others)', fontsize=14)  # Set plot title
plt.axis('equal')  # Equal aspect ratio
plt.tight_layout()  # Adjust layout spacing
plt.legend(labels, loc='upper right')  # Legend in the upper right corner
# NOTE(review): the style is selected after the pie is drawn, so it presumably
# only affects figures created later — confirm whether it should come first.
plt.style.use('Solarize_Light2')
plt.figtext(0.5, -0.05,'Figure 8. Pie Chart of Preferred Web Frameworks', # Centered caption below the plot
            wrap=True, horizontalalignment='center', fontsize=10)
plt.show()  # Display the plot

dfAC = remove_outliers(df, 'ConvertedCompYearly')  # Keep rows whose compensation is within the IQR bounds

dfAC['Age'].unique()  # Distinct Age labels present after filtering

array(['18-24 years old', '25-34 years old', '35-44 years old',
       '45-54 years old', '55-64 years old', '65 years or older',
       'Under 18 years old', 'Prefer not to say'], dtype=object)

def _age_bracket_midpoint(bounds):
    """Return the mean of the numeric strings in ``bounds``, or NaN if it is not a list."""
    if not isinstance(bounds, list):
        return np.nan
    return sum([float(bound) for bound in bounds]) / len(bounds)


# Discard the brackets that are not converted to a number.
excluded_ages = ['Prefer not to say', 'Under 18 years old']
dfAC = dfAC[~dfAC['Age'].isin(excluded_ages)].copy()

# '25-34 years old' -> '25-34' -> ['25', '34'] -> 29.5; '65 years or older' -> ['65'] -> 65.0
dfAC['Age'] = (
    dfAC['Age']
    .str.replace(r' years old| years or older', '', regex=True)
    .str.split('-')
    .apply(_age_bracket_midpoint)
)

dfAC = dfAC.dropna(subset=['Age', 'ConvertedCompYearly'])
group2 = dfAC.groupby('Age')['ConvertedCompYearly'].mean().reset_index()  # Mean salary per numeric age

# Line plot (with markers) of mean yearly compensation against age.
fig = px.line(group2,
              x='Age',
              y='ConvertedCompYearly',
              markers=True,
              title='Relationship Between Age and Converted Yearly Compensation',
              labels={
                  'Age': 'Age',
                  'ConvertedCompYearly': 'Converted Compensation (Yearly)'
              })

fig.update_layout(  # Figure size, font sizes and top/bottom margins
    width = 900,
    height = 400,
    title_font_size=18,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    margin=dict(t=80, b=80)

)

fig.add_annotation(
    text="Figure 9. Relationship Between Age and Converted Yearly Compensation",  # Centered caption below the plot
    xref="paper", yref="paper",
    x=0.5, y=-.43,
    showarrow=False,  # Caption only: suppress the annotation arrow, as in Figure 2
    font=dict(size=12),
    align="center"
)
fig.show()

# Mean yearly compensation per education level, on outlier-free salaries.
dfEd = remove_outliers(df, 'ConvertedCompYearly').dropna(subset=['EdLevel', 'ConvertedCompYearly'])
mean_by_education = dfEd.groupby('EdLevel')['ConvertedCompYearly'].mean()

# Short display names for the long survey answers.
Edlvlnames = dict([
    ('Associate degree (A.A., A.S., etc.)', 'Associate degree'),
    ('Bachelor’s degree (B.A., B.S., B.Eng., etc.)', 'Bachelor’s degree'),
    ('Master’s degree (M.A., M.S., M.Eng., MBA, etc.)', 'Master’s degree'),
    ('Some college/university study without earning a degree', 'Some college, no degree'),
    ('Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)', 'Secondary school'),
    ('Professional degree (JD, MD, Ph.D, Ed.D, etc.)', 'Professional degree'),
])
group3 = mean_by_education.reset_index().replace(Edlvlnames)

# Line plot (with markers) of mean yearly compensation per education level.
fig = px.line(group3,
              x='EdLevel',
              y='ConvertedCompYearly',
              markers=True,
              title='Relationship Between Education Level and Converted Yearly Compensation',
              labels={
                  'EdLevel': 'Education Level',
                  'ConvertedCompYearly': 'Converted Compensation (Yearly)'
              })

# Mean salaries of the two education levels being compared.
high = group3[group3['EdLevel'] == 'Professional degree']['ConvertedCompYearly'].values[0]
low = group3[group3['EdLevel'] == 'Secondary school']['ConvertedCompYearly'].values[0]

# Compute the percentage difference relative to the secondary-school mean
diff_percent = ((high - low) / low) * 100

print(f"Professional degree holders earn {diff_percent:.1f}% more than those with only secondary education.")


fig.update_layout(  # Figure size, font sizes, vertical tick labels and a large bottom margin
    width=1300,
    height=500,
    title_font_size=18,
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    xaxis_tickangle=270,
    margin=dict(t=80, b=210)
)
fig.add_annotation(  # Centered caption below the plot
    text="Figure 10. Relationship Between Education Level and Converted Yearly Compensation",
    xref="paper", yref="paper",
    x=0.5, y=-1.1,
    showarrow=False,  # Caption only: suppress the annotation arrow, as in Figure 2
    font=dict(size=12),
    align="center"
)
fig.show()

Professional degree holders earn 57.4% more than those with only secondary education.

# Coding experience, salary and job satisfaction for rows with in-range salaries
# and no missing values in any of the three columns.
dfC = df[['YearsCode', 'ConvertedCompYearly', 'JobSat']]
dfC = remove_outliers(dfC, 'ConvertedCompYearly')
dfC = dfC.dropna(subset=['YearsCode', 'ConvertedCompYearly', 'JobSat'])
print(dfC['YearsCode'].unique())  # Show the raw YearsCode labels before cleaning
# Keep the first run of digits of each label and store it as a float, so the
# column is numeric before scaling instead of relying on implicit string
# coercion. expand=False makes str.extract return a Series.
# NOTE(review): 'Less than 1 year' becomes 1 and 'More than 50 years' becomes 50,
# so they merge with the '1' and '50' answers — confirm this is intended.
dfC['YearsCode'] = dfC['YearsCode'].str.extract(r'(\d+)', expand=False).astype(float)

['3' '15' '7' '32' '38' '21' '10' '6' '40' '20' '9' '25' '12' '14' '11'
 '16' '18' '28' '37' '30' '47' '17' '36' '19' '24' '2' '5' '34' '23' '26'
 '13' '33' '31' '45' '22' '4' '35' '27' '48' '8' '42' '29'
 'More than 50 years' '39' '43' '44' '50' '1' 'Less than 1 year' '46' '41'
 '49']

# Rescale every column of dfC with MinMaxScaler, then average the scaled
# salary and job satisfaction for each distinct scaled YearsCode value.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dfC)
dfC_scaled = pd.DataFrame(scaled_values, columns=dfC.columns)

dfC_grouped = dfC_scaled.groupby('YearsCode').mean().reset_index()

# Scatter of scaled salary against scaled coding years, colored by scaled job satisfaction.
plt.figure(figsize=(10, 6))
plt.scatter(
    dfC_grouped['YearsCode'],
    dfC_grouped['ConvertedCompYearly'],
    c=dfC_grouped['JobSat'],
    cmap='viridis',
    s=100,
)

plt.title('Comparison of Years of Coding, Compensation, and Job Satisfaction (Normalized)')
plt.xlabel('Years of Coding (Normalized)')
plt.ylabel('Yearly Compensation (Normalized)')

plt.colorbar(label='Job Satisfaction (Normalized)')
plt.figtext(
    0.5, -0.05,
    'Figure 11. Comparison of Years of Coding, Compensation, and Job Satisfaction',  # Centered caption below the plot
    wrap=True, horizontalalignment='center', fontsize=10,
)
plt.show()

	ResponseId	MainBranch	Age	Employment	RemoteWork	Check	CodingActivities	EdLevel	LearnCode	LearnCodeOnline	...	JobSatPoints_6	JobSatPoints_7	JobSatPoints_8	JobSatPoints_9	JobSatPoints_10	JobSatPoints_11	SurveyLength	SurveyEase	ConvertedCompYearly	JobSat
0	1	I am a developer by profession	Under 18 years old	Employed, full-time	Remote	Apples	Hobby	Primary/elementary school	Books / Physical media	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	2	I am a developer by profession	35-44 years old	Employed, full-time	Remote	Apples	Hobby;Contribute to open-source projects;Other...	Bachelor’s degree (B.A., B.S., B.Eng., etc.)	Books / Physical media;Colleague;On the job tr...	Technical documentation;Blogs;Books;Written Tu...	...	0.0	0.0	0.0	0.0	0.0	0.0	NaN	NaN	NaN	NaN
2	3	I am a developer by profession	45-54 years old	Employed, full-time	Remote	Apples	Hobby;Contribute to open-source projects;Other...	Master’s degree (M.A., M.S., M.Eng., MBA, etc.)	Books / Physical media;Colleague;On the job tr...	Technical documentation;Blogs;Books;Written Tu...	...	NaN	NaN	NaN	NaN	NaN	NaN	Appropriate in length	Easy	NaN	NaN
3	4	I am learning to code	18-24 years old	Student, full-time	NaN	Apples	NaN	Some college/university study without earning ...	Other online resources (e.g., videos, blogs, f...	Stack Overflow;How-to videos;Interactive tutorial	...	NaN	NaN	NaN	NaN	NaN	NaN	Too long	Easy	NaN	NaN
4	5	I am a developer by profession	18-24 years old	Student, full-time	NaN	Apples	NaN	Secondary school (e.g. American high school, G...	Other online resources (e.g., videos, blogs, f...	Technical documentation;Blogs;Written Tutorial...	...	NaN	NaN	NaN	NaN	NaN	NaN	Too short	Easy	NaN	NaN

	Age	Employment	WorkExp	RemoteWork	CodingActivities	EdLevel	YearsCode	DevType	Country	ConvertedCompYearly	...	LanguageAdmired	DatabaseHaveWorkedWith	DatabaseWantToWorkWith	PlatformHaveWorkedWith	PlatformWantToWorkWith	WebframeHaveWorkedWith	WebframeWantToWorkWith	OpSysProfessional use	Industry	JobSat
0	Under 18 years old	Employed, full-time	NaN	Remote	Hobby	Primary/elementary school	NaN	NaN	United States of America	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	35-44 years old	Employed, full-time	17.0	Remote	Hobby;Contribute to open-source projects;Other...	Bachelor’s degree (B.A., B.S., B.Eng., etc.)	20	Developer, full-stack	United Kingdom of Great Britain and Northern I...	NaN	...	Bash/Shell (all shells);Go;HTML/CSS;Java;JavaS...	Dynamodb;MongoDB;PostgreSQL	PostgreSQL	Amazon Web Services (AWS);Heroku;Netlify	Amazon Web Services (AWS);Heroku;Netlify	Express;Next.js;Node.js;React	Express;Htmx;Node.js;React;Remix	MacOS	NaN	NaN
2	45-54 years old	Employed, full-time	NaN	Remote	Hobby;Contribute to open-source projects;Other...	Master’s degree (M.A., M.S., M.Eng., MBA, etc.)	37	Developer Experience	United Kingdom of Great Britain and Northern I...	NaN	...	C#	Firebase Realtime Database	Firebase Realtime Database	Google Cloud	Google Cloud	ASP.NET CORE	ASP.NET CORE	Windows	NaN	NaN
3	18-24 years old	Student, full-time	NaN	NaN	NaN	Some college/university study without earning ...	4	Developer, full-stack	Canada	NaN	...	HTML/CSS;Java;JavaScript;PowerShell;Python;SQL...	MongoDB;MySQL;PostgreSQL;SQLite	MongoDB;MySQL;PostgreSQL	Amazon Web Services (AWS);Fly.io;Heroku	Amazon Web Services (AWS);Vercel	jQuery;Next.js;Node.js;React;WordPress	jQuery;Next.js;Node.js;React	NaN	NaN	NaN
4	18-24 years old	Student, full-time	NaN	NaN	NaN	Secondary school (e.g. American high school, G...	9	Developer, full-stack	Norway	NaN	...	C++;HTML/CSS;JavaScript;Lua;Python	PostgreSQL;SQLite	PostgreSQL;SQLite	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Age	Employment	WorkExp	RemoteWork	CodingActivities	EdLevel	YearsCode	DevType	Country	ConvertedCompYearly	...	LanguageAdmired	DatabaseHaveWorkedWith	DatabaseWantToWorkWith	PlatformHaveWorkedWith	PlatformWantToWorkWith	WebframeHaveWorkedWith	WebframeWantToWorkWith	OpSysProfessional use	Industry	JobSat
count	65437	65437	29658.000000	54806	54466	60784	59869	59445	58930	2.343500e+04	...	50872	50254	42558	42366	34532	45161	38535	52973	28858	29126.000000
unique	8	110	NaN	3	118	8	52	34	185	NaN	...	12335	9050	8478	5467	4784	12235	11654	2032	15	NaN
top	25-34 years old	Employed, full-time	NaN	Hybrid (some remote, some in-person)	Hobby	Bachelor’s degree (B.A., B.S., B.Eng., etc.)	10	Developer, full-stack	United States of America	NaN	...	Python	PostgreSQL	PostgreSQL	Amazon Web Services (AWS)	Amazon Web Services (AWS)	React	React	Windows	Software Development	NaN
freq	23911	39041	NaN	23015	9993	24942	4561	18260	11095	NaN	...	1555	3216	3738	6606	4859	1284	997	10472	11918	NaN
mean	NaN	NaN	11.466957	NaN	NaN	NaN	NaN	NaN	NaN	8.615529e+04	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.935041
std	NaN	NaN	9.168709	NaN	NaN	NaN	NaN	NaN	NaN	1.867570e+05	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.088259
min	NaN	NaN	0.000000	NaN	NaN	NaN	NaN	NaN	NaN	1.000000e+00	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.000000
25%	NaN	NaN	4.000000	NaN	NaN	NaN	NaN	NaN	NaN	3.271200e+04	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	6.000000
50%	NaN	NaN	9.000000	NaN	NaN	NaN	NaN	NaN	NaN	6.500000e+04	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	7.000000
75%	NaN	NaN	16.000000	NaN	NaN	NaN	NaN	NaN	NaN	1.079715e+05	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	8.000000
max	NaN	NaN	50.000000	NaN	NaN	NaN	NaN	NaN	NaN	1.625660e+07	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	10.000000

DatabaseHaveWorkedWith	PostgreSQL	MySQL	SQLite	Microsoft SQL Server	MongoDB
Age
18-24 years old	5262.0	5204.0	4061.0	2137.0	3615.0
25-34 years old	10120.0	7707.0	6140.0	4470.0	5123.0
35-44 years old	6279.0	4507.0	3952.0	3559.0	2546.0
45-54 years old	2458.0	2049.0	1684.0	1987.0	939.0
55-64 years old	780.0	757.0	630.0	807.0	266.0
65 years or older	127.0	219.0	177.0	160.0	46.0
Prefer not to say	61.0	73.0	73.0	43.0	35.0
Under 18 years old	448.0	583.0	648.0	112.0	437.0

Exploratory Data Analysis of Developer Survey Results¶

Dataset Description¶

Objective¶

Key Findings¶

Import required libraries¶

Initial Data Loading and Cleaning¶

Load CSV data into DataFrame¶

Returns the number of rows and columns in the DataFrame.¶

Returns a list of column names in the DataFrame.¶

Creates a new DataFrame df by selecting specific columns¶

Display first rows of the new DataFrame¶

Generates descriptive statistics for all columns¶

Returns the data type of each column in the DataFrame.¶

Check for missing values in DataFrame¶

Counts and prints the number of duplicate rows in the DataFrame, then removes them.¶

Processing Database Experience by Age¶

Pivot Table: Age vs. Database Experience¶

Bar Chart: Top 5 Databases Used by Age¶

Conclusions: Database Usage by Age Group¶

Choropleth Map: Number of Developers by Country¶

Conclusions: Number of Developers by Country¶

Analyzes annual compensation in the top 10 countries with the most developers¶

Boxplot of Annual Compensation in the Top 10 Countries¶

Conclusions: Annual Compensation in the Top 10 Countries with the Most Developers¶

Bar Plot: Annual Compensation by Work Modality¶

Conclusions: Annual Compensation by Work Modality¶

Analysis and Visualization of Programming Languages Used Across Countries¶

Conclusions: Languages Worked With Across Countries¶

Analysis of Desired Programming Languages Across Countries¶

Bubble Plot: Desired Programming Languages Across Countries¶

Conclusions: Languages Wanted Across Countries¶

Relationship Between Work Experience and Annual Salary¶

Conclusions: Work Experience vs Annual Salary¶

Analysis of Desired Web Frameworks¶

Pie Chart: Preferred Web Frameworks (Top 10 + Others)¶

Conclusions: Preferred Web Frameworks¶

Line Plot: Average Annual Compensation by Age Group¶

Cleaning and Transforming Age Data for Salary Analysis¶

Line Plot: Relationship Between Age and Annual Compensation¶

Conclusions: Age vs Annual Compensation¶

Average Salary by Education Level¶

Line Plot: Education Level vs Annual Compensation¶

Conclusions: Education Level vs Annual Compensation¶

Scatter Plot: Normalized Compensation vs. Coding Experience Colored by Job Satisfaction¶

Normalizing Data Using Min-Max Scaling¶

Scatter Plot: Years of Coding vs. Salary Colored by Job Satisfaction (Normalized)¶

Conclusions: Coding Experience vs Compensation & Job Satisfaction (Normalized)¶

FINAL CONCLUSIONS¶

Creates a new DataFrame `df` by selecting specific columns¶