Data Science Job Search

Have you ever spent a lot of time looking for the “right” job? Do you spend more than “a little” time each day scrolling through LinkedIn? Many variables shape a job search, such as financial compensation, the ability to relocate, and opportunities for technical advancement. For instance, a person may decide to look for work in a particular location if there are more data science jobs there than elsewhere: San Francisco is one of many large cities with a significant number of tech companies, startups, and research institutions that employ data scientists.

Another factor is the cost of living: living costs in a particular area can have a big impact on a job search. Since data scientists’ salaries vary depending on where they work, it is important to weigh the cost of living when comparing offers.

Access to networking opportunities also matters: larger metropolitan areas tend to host more data science conferences, meetups, and networking events, increasing the opportunity to connect with peers and find openings.

Finally, data science is a vast field with numerous sub-specialties, such as machine learning, reinforcement learning, and natural language processing. Where demand for a specific sub-specialty is higher, it may be easier to find openings in it. The purpose of this website, therefore, is to offer a customized user experience that makes a data science job search simpler.

Help the student get an offer!

Slide the student to the offer to discover…

Did you know that choosing a career field you are passionate about can shorten your job search, and that recruiters will most likely find better offers for you? Let’s dive into a general view of companies by area and the number of positions they offer.

Which companies provide more positions by area?

Click on each category to see the companies that have roles in that area; click on the title at the top to go back.

Code
chart = {
  // x and y map the current zoom domain to pixel coordinates
  const x = d3.scaleLinear().rangeRound([0, width]);
  const y = d3.scaleLinear().rangeRound([0, height]);
  // one color per top-level field; leaves inherit their parent's color
  const color = d3.scaleOrdinal(d3.schemeSet3);
  const svg = d3.create("svg")
      .attr("viewBox", [0.5, -30.5, width, height + 30])
      .style("font", "14px sans-serif");

  let group = svg.append("g")
      .call(render, treemap(data));

  function render(group, root) {
    const node = group
      .selectAll("g")
      .data(root.children.concat(root))
      .join("g");

    node.filter(d => d === root ? d.parent : d.children)
        .attr("cursor", "pointer")
        .on("click", (event, d) => d === root ? zoomout(root) : zoomin(d));

    node.append("title")
        .text(d => `${name(d)}\n${format(d.value)}`);

    node.append("rect")
        .attr("id", d => (d.leafUid = DOM.uid("leaf")).id)
        .attr("fill", d => d === root ? "#fff" : d.children ? color(name(d)) : color(d.parent.data.name))
        .attr("stroke", "#fff");

    node.append("clipPath")
        .attr("id", d => (d.clipUid = DOM.uid("clip")).id)
      .append("use")
        .attr("xlink:href", d => d.leafUid.href);

    node.append("text")
        .attr("clip-path", d => d.clipUid)
        .attr("font-weight", d => d === root ? "bold" : null)
      .selectAll("tspan")
      .data(d => (d === root ? name(d) : d.data.name).split(/(?=[A-Z][^A-Z])/g).concat(format(d.value)))
      .join("tspan")
        .attr("x", 3)
        .attr("y", (d, i, nodes) => `${(i === nodes.length - 1) * 0.3 + 1.1 + i * 0.9}em`)
        .attr("fill-opacity", (d, i, nodes) => i === nodes.length - 1 ? 0.7 : null)
        .attr("font-weight", (d, i, nodes) => i === nodes.length - 1 ? "normal" : null)
        .text(d => d);

    group.call(position, root);
  }

  function position(group, root) {
    group.selectAll("g")
        .attr("transform", d => d === root ? `translate(0,-30)` : `translate(${x(d.x0)},${y(d.y0)})`)
      .select("rect")
        .attr("width", d => d === root ? width : x(d.x1) - x(d.x0))
        .attr("height", d => d === root ? 30 : y(d.y1) - y(d.y0));
  }

  // When zooming in, draw the new nodes on top, and fade them in.
  function zoomin(d) {
    const group0 = group.attr("pointer-events", "none");
    const group1 = group = svg.append("g").call(render, d);

    x.domain([d.x0, d.x1]);
    y.domain([d.y0, d.y1]);

    svg.transition()
        .duration(750)
        .call(t => group0.transition(t).remove()
          .call(position, d.parent))
        .call(t => group1.transition(t)
          .attrTween("opacity", () => d3.interpolate(0, 1))
          .call(position, d));
  }

  // When zooming out, draw the old nodes on top, and fade them out.
  function zoomout(d) {
    const group0 = group.attr("pointer-events", "none");
    const group1 = group = svg.insert("g", "*").call(render, d.parent);

    x.domain([d.parent.x0, d.parent.x1]);
    y.domain([d.parent.y0, d.parent.y1]);

    svg.transition()
        .duration(750)
        .call(t => group0.transition(t).remove()
          .attrTween("opacity", () => d3.interpolate(1, 0))
          .call(position, d))
        .call(t => group1.transition(t)
          .call(position, d.parent));
  }

  return svg.node();
}
// load the field/company hierarchy exported during preprocessing
data = FileAttachment("output.json").json()

// build the treemap layout: sum leaf values and sort siblings largest-first
treemap = data => d3.treemap()
    .tile(tile)
  (d3.hierarchy(data)
    .sum(d => d.value)
    .sort((a, b) => b.value - a.value))

// Custom tiling: lay the node out at full size with treemapBinary,
// then rescale its children into the current zoom domain (x0, y0, x1, y1).
function tile(node, x0, y0, x1, y1) {
  d3.treemapBinary(node, 0, 0, width, height);
  for (const child of node.children) {
    child.x0 = x0 + child.x0 / width * (x1 - x0);
    child.x1 = x0 + child.x1 / width * (x1 - x0);
    child.y0 = y0 + child.y0 / height * (y1 - y0);
    child.y1 = y0 + child.y1 / height * (y1 - y0);
  }
}

name = d => d.ancestors().reverse().map(d => d.data.name).join("/")

width = 954
height = 924
format = d3.format(",d")
d3 = require("d3@6")
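
For reference, the treemap above feeds output.json through d3.hierarchy, which expects a nested tree of named nodes with a value on each leaf. Below is a minimal sketch of that shape, written as a Python dict; the field and company names are illustrative placeholders, not our actual data.

Code
import json

# Hypothetical miniature of output.json: fields at the first level,
# companies as leaves, value = number of posted positions.
sample_hierarchy = {
    "name": "jobs",
    "children": [
        {"name": "machine_learning", "children": [
            {"name": "CompanyA", "value": 12},
            {"name": "CompanyB", "value": 7},
        ]},
        {"name": "data_analyst_", "children": [
            {"name": "CompanyC", "value": 4},
        ]},
    ],
}

with open("output.json", "w") as f:
    json.dump(sample_hierarchy, f)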

The plot above shows that some companies provide offers across multiple fields, while others focus on just a few. As mentioned at the beginning, location is another key factor in a role decision, so next we explore which companies provide job offers across the country (or remotely) for the top field we are exploring.

Which companies provide offers in more locations?

Data science offers across company locations

Code
import plotly.graph_objects as go
import pandas as pd

# forward slashes keep the path portable across operating systems
df = pd.read_csv("../data/df_clean.csv")
df_slice = df.query("Area == 'data_scientist_'")
# nunique() counts distinct locations per company (count() would tally offers instead)
counts = df_slice.groupby('Company')['Location'].nunique()

fig = go.Figure(data=[go.Bar(x=counts.index, y=counts.values,
                             marker=dict(color='#E6E6FA'))])
font_dict = dict(family='Arial', size=14, color='white')
fig.update_layout(title='Number of locations where companies provide offers in Data Science',
                  xaxis_title='Companies', yaxis_title='Number of Locations',
                  paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)',
                  template='plotly_dark', font=font_dict)
fig.show()
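
As a side note, with many companies on the x-axis the ranking is easier to read when the bars are sorted. A small optional sketch, not part of the site’s code, that sorts the counts before plotting:

Code
import plotly.graph_objects as go
import pandas as pd

df = pd.read_csv("../data/df_clean.csv")
# distinct locations per company, sorted so the tallest bars come first
counts = (df.query("Area == 'data_scientist_'")
            .groupby('Company')['Location']
            .nunique()
            .sort_values(ascending=False))

fig = go.Figure(data=[go.Bar(x=counts.index, y=counts.values,
                             marker=dict(color='#E6E6FA'))])
fig.show()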

After visualizing the diversity of company locations, we get a better overview of which companies provide more opportunities. To dig deeper into each company’s locations, the next graph shows the states and the number of job offers across all categories.

Where are the companies located and how many positions are posted by state?

Hover over the map to see the number of positions in each state, as well as the companies located there.

Code
import plotly.graph_objects as go
import pandas as pd 

df_states = pd.read_csv("../data/companyperstate.csv")  # forward slashes keep the path portable
fig = go.Figure(data=go.Choropleth(
    locations=df_states['state'], # Spatial coordinates
    z = df_states['positions'].astype(int), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'blues',
    colorbar_title = "Number of positions",
    text=df_states['company_names'], # hover text
    marker_line_color='black'
))
font_dict=dict(family='Arial',
               size=14,
               color='white'
               )

fig.update_layout(
    title_text = 'Number of Data Science positions by state',
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    template='plotly_dark',
    geo_scope='usa',
    font=font_dict,
    width=750, height=500
)
fig.show()
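
The file companyperstate.csv is produced during preprocessing, which is not shown on this page. Below is a rough sketch of how such a file could be derived from df_clean.csv, assuming Location strings end in a two-letter state code; that format is an assumption about the data, not a confirmed one.

Code
import pandas as pd

df = pd.read_csv("../data/df_clean.csv")

# Assumption: locations look like "City, ST"; pull out the trailing state code.
df['state'] = df['Location'].str.extract(r'\b([A-Z]{2})\s*$', expand=False)

# Count positions per state and collect the companies hiring there.
df_states = (df.dropna(subset=['state'])
               .groupby('state')
               .agg(positions=('Job', 'count'),
                    company_names=('Company', lambda s: ', '.join(sorted(s.unique()))))
               .reset_index())

df_states.to_csv("../data/companyperstate.csv", index=False)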

Finally, the most important objective of this site is to reduce the time a person spends searching for positions and to help them define their area of expertise. Through the next visualizations, we will take a look at the offers’ responsibilities.

Which responsibilities do companies ask for across all fields?

The next word cloud contains the most frequent words in the responsibility descriptions that companies have shared.

Code
d3Cloud = require("d3-cloud@1")

function WordCloud(title,text, {
  size = group => group.length, // Given a grouping of words, returns the size factor for that word
  word = d => d, // Given an item of the data array, returns the word
  marginTop = 0, // top margin, in pixels
  marginRight = 0, // right margin, in pixels
  marginBottom = 0, // bottom margin, in pixels
  marginLeft = 0, // left margin, in pixels
  width = 640, // outer width, in pixels
  height = 400, // outer height, in pixels
  maxWords = 250, // maximum number of words to extract from the text
  fontFamily = "sans-serif", // font family
  fontScale = 30, // base font size
  padding = 0, // amount of padding between the words (in pixels)
  rotate = 0, // a constant or function to rotate the words
  invalidation // when this promise resolves, stop the simulation
} = {}) {
  const words = typeof text === "string" ? text.split(/\W+/g) : Array.from(text);
  
  const data = d3.rollups(words, size, w => w)
    .sort(([, a], [, b]) => d3.descending(a, b))
    .slice(0, maxWords)
    .map(([key, size]) => ({text: word(key), size}));
  
  const svg = d3.create("svg")
      .attr("viewBox", [0, 0, width, height])
      .attr("width", width)
      .attr("font-family", fontFamily)
      .attr("text-anchor", "middle")
      .attr("fill", " #E6E6FA") 
      .attr("style", "max-width: 100%; height: auto; height: intrinsic;")
      .text(title);

  const g = svg.append("g").attr("transform", `translate(${marginLeft},${marginTop})`);

  const cloud = d3Cloud()
      .size([width - marginLeft - marginRight, height - marginTop - marginBottom])
      .words(data)
      .padding(padding)
      .rotate(rotate)
      .font(fontFamily)
      .fontSize(d => Math.sqrt(d.size) * fontScale)
      .on("word", ({size, x, y, rotate, text}) => {
        g.append("text")
            .attr("font-size", size)
            .attr("transform", `translate(${x},${y}) rotate(${rotate})`)
            .text(text);
      });

  cloud.start();
  invalidation && invalidation.then(() => cloud.stop());
  return svg.node();
}
WordCloud("software data cloud design development requirements based system big components computing technical solutions provides technologies new business architecture team distributed blockchain data team design develop software business development solutions work new ensure teams support technical security project requirements implement quality data business analysis support work team develop management provide reports reporting analytics perform ensure development tools including information technical processes data business analysis learning team develop models work support machine development analytics solutions new provide techniques tools teams product design new software data time series model analytics models lead team algorithm technical ai learning scientists opera projects development ml methods performance develop data product performance model models work time analysis team ai business market ml develop teams methods working engineering projects series ", {
  width: 250,
  height: 100,
  size: () => .3 + Math.random(),
  rotate: () => (~~(Math.random() * 6) - 3) * 30
})
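
The long string passed to WordCloud above was assembled from the postings’ responsibility text. Here is a sketch of how such a frequency view could be computed directly, assuming the raw data has a Responsibilities text column; that column name is an assumption.

Code
import re
from collections import Counter
import pandas as pd

df = pd.read_csv("../data/df_clean.csv")

# Assumption: a 'Responsibilities' column holds the free-text duties per posting.
text = ' '.join(df['Responsibilities'].dropna().astype(str)).lower()
words = re.findall(r'[a-z]+', text)

# Drop very short tokens, then keep the most frequent terms for the cloud.
common = Counter(w for w in words if len(w) > 3).most_common(100)
print(common[:10])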

This visualization is useful because many responsibilities, such as design, development, and business analysis, are required across different areas. The next time you are looking for a job, no matter the field, remember to be familiar with the most frequently requested responsibilities!

Finally, we want to reduce the time you spend looking at uninteresting positions, so the next graph identifies the best data science field for you, based on the similarity between your resume and the responsibilities of that field’s positions.

Since we do not have a server to store your resume, the procedure below runs with a built-in resume; the code works with any uploaded resume, so try it locally to find your own fit. (Similarity is calculated using a sentence transformer and cosine similarity.)
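
For reference, the cosine similarity between two embedding vectors is their dot product divided by the product of their norms. A minimal numpy sketch of the computation that util.pytorch_cos_sim performs for us:

Code
import numpy as np

def cosine_similarity(u, v):
    # cos(u, v) = (u . v) / (||u|| * ||v||); closer to 1 means more similar
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# e.g. cosine_similarity(model.encode(resume_text), model.encode(field_text))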

Which is my best field “fit”?

Code
import plotly.graph_objects as go
import pandas as pd 
from sentence_transformers import SentenceTransformer, util
from PyPDF2 import PdfReader

# insert your resume file name
resume_file_name = 'resume.pdf'
reader = PdfReader(resume_file_name)
page = reader.pages[0]
resume_text = page.extract_text()

# loading responsibilities per field (forward slashes keep the path portable)
words_freq = pd.read_csv("../data/words_freq.csv")
words_freq = words_freq.drop(columns=["Unnamed: 0"])

# loading transformer to compute embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
columns = words_freq.columns
embeddings = []
for column in columns:
  sentence = words_freq[column][0]
  embeddings.append(model.encode(sentence, convert_to_tensor=True))

input_user = model.encode(resume_text, convert_to_tensor=True)

responses = {}

for i in range(1, len(columns)):
  key_clean = columns[i].replace("_", " ")
  responses[key_clean] = util.pytorch_cos_sim(embeddings[i], input_user)[0][0].item() * 100

print("The similarities between your resume and the responsibilities of jobs per field are:")
best = 0   # avoid shadowing the built-in max()
selec_clean = ""

for i in responses.keys():
  round_v = round(responses[i], 2)
  print(i, ": ", round_v, "%")
  if round_v > best:
    best = round_v
    selec_clean = i

selection = selec_clean.replace(" ", "_")

df_clean = pd.read_csv("../data/df_clean.csv")
df_slice = df_clean.query("Area == '" + selection + "'")
# drop() returns a new DataFrame, so keep the assignment
df_slice = df_slice.drop(columns=["Area"])

fig = go.Figure(data=[go.Table(
    header=dict(values=["Job Title","Company","Location"],
                fill_color='#edecaf',
                align='left'),
    cells=dict(values=[df_slice.Job, df_slice.Company, df_slice.Location],
               fill_color='lavender',
               align='left'))
])
font_dict=dict(family='Arial',
               size=12,
               color='black'
               )

title = "Positions in the "+selec_clean+"field:"
fig.update_layout(
    title_text = title,
    title_font_color="white",
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=font_dict,
    width=750, height=500
)
fig.show()
The similarities between your resume and the responsibilities of jobs per field are:
block chain  :  10.37 %
data analyst  :  10.98 %
data scientist  :  15.76 %
deep learning  :  18.99 %
machine learning  :  23.11 %
natural language processing  :  24.47 %
neural networks  :  22.06 %
reinforcement learning  :  21.84 %
time series analysis  :  23.78 %
time series  :  19.72 %

Here is another example resume, to evaluate which field’s responsibilities it is most similar to!

Code
import plotly.graph_objects as go
import pandas as pd 
from sentence_transformers import SentenceTransformer, util
from PyPDF2 import PdfReader

# insert your resume file name
resume_file_name = 'resume2.pdf'
reader = PdfReader(resume_file_name)
page = reader.pages[0]
resume_text = page.extract_text()

# loading responsibilities per field (forward slashes keep the path portable)
words_freq = pd.read_csv("../data/words_freq.csv")
words_freq = words_freq.drop(columns=["Unnamed: 0"])

# loading transformer to compute embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
columns = words_freq.columns
embeddings = []
for column in columns:
  sentence = words_freq[column][0]
  embeddings.append(model.encode(sentence, convert_to_tensor=True))

input_user = model.encode(resume_text, convert_to_tensor=True)

responses = {}

for i in range(1, len(columns)):
  key_clean = columns[i].replace("_", " ")
  responses[key_clean] = util.pytorch_cos_sim(embeddings[i], input_user)[0][0].item() * 100

print("The similarities between your resume and the responsibilities of jobs per field are:")
best = 0   # avoid shadowing the built-in max()
selec_clean = ""

for i in responses.keys():
  round_v = round(responses[i], 2)
  print(i, ": ", round_v, "%")
  if round_v > best:
    best = round_v
    selec_clean = i

selection = selec_clean.replace(" ", "_")

df_clean = pd.read_csv("../data/df_clean.csv")
df_slice = df_clean.query("Area == '" + selection + "'")
# drop() returns a new DataFrame, so keep the assignment
df_slice = df_slice.drop(columns=["Area"])

fig = go.Figure(data=[go.Table(
    header=dict(values=["Job Title","Company","Location"],
                fill_color='#edecaf',
                align='left'),
    cells=dict(values=[df_slice.Job, df_slice.Company, df_slice.Location],
               fill_color='lavender',
               align='left'))
])
font_dict=dict(family='Arial',
               size=12,
               color='black'
               )

title = "Positions in the "+selec_clean+"field:"
fig.update_layout(
    title_text = title,
    title_font_color="white",
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=font_dict,
    width=750, height=500
)
fig.show()
The similarities between your resume and the responsibilities of jobs per field are:
block chain  :  39.98 %
data analyst  :  40.54 %
data scientist  :  43.0 %
deep learning  :  49.62 %
machine learning  :  48.84 %
natural language processing  :  44.22 %
neural networks  :  44.54 %
reinforcement learning  :  45.95 %
time series analysis  :  41.67 %
time series  :  31.71 %

We hope this site is useful and has helped you reduce the time you spend exploring job offers!