Assignment 1 - pburkard88/DS_BOS_07 GitHub Wiki
##Lahman Baseball Dataset
Download and set up MySQL server Open-source SQL DB:
Download and set up MySQL Workbench:
Download the Baseball DB and import the schema:
-
Download and unzip this script
-
Open MySQL Workbench and connect your local DB (localhost)
-
File -> Run SQL Script...
-
Select the BDB-sql-2009-11-25.sql file you unzipped, and enter 'baseball' as the Default Schema Name.
-
Click Run
-
Once the script is done running, click on the small refresh icon next to SCHEMAS on the left sidebar of MySQL Workbench. You should now see the baseball schema (database), complete with all its tables.
##Queries Use the appropriate SQL queries to find answers to the following questions:
- Find all of the Triple Crown (Award) winners ever in Major League Baseball, with their complete batting stats for the given year. Order the results in descending order first by batting average, then by RBIs, and lastly by home runs.
- Calculate the number of MVPs and Triple Crown winners by position ever in major league baseball.
- Calculate the number of MVPs and Triple Crown winners by team ever in major league baseball.
- Calculate the average batting average, RBIs, and home runs by position ever in major league baseball. Only consider seasons where a player had at least 300 at-bats (AB).
- Return all player info for all players that won an MVP and a Gold Glove during their careers, along with the number of times they won each.
- Calculate the number of world series, division titles, and league championships for all teams.
- Calculate the average salary (as a percentage of yearly average) of all MVPs ever in major league baseball.
- Use the statistics available at Baseball-Reference to add Miguel Cabrera's 2012 Triple Crown season to your Database. Make sure to add the appropriate information to all relevant tables.
Try to solve as many of these problems as you can. Some might be quite complex, feel free to stop if you feel that you have a good handle on SQL.
Submit:
- Your queries
- Your results (truncated is fine for large resultsets)
- A screenshot of your DB tables on either the command line or in MySQL Workbench
Sample Solutions
SELECT m.nameFirst, m.nameLast, a.awardID, a.yearID, b.*, b.H/b.AB as battingAvg
FROM AwardsPlayers a
INNER JOIN Master m ON a.playerID = m.playerID
INNER JOIN Batting b ON a.playerID = b.playerID AND a.yearID = b.yearID
WHERE a.awardID = 'Triple Crown'
ORDER BY battingAvg DESC, b.RBI DESC, b.HR DESC;
-- Alternative with implicit joins
SELECT m.nameFirst, m.nameLast, a.awardID, a.yearID, b.*, b.H/b.AB as battingAvg
FROM AwardsPlayers a, Master m, Batting b
WHERE
a.awardID = 'Triple Crown' AND
a.playerID = m.playerID AND
a.playerID = b.playerID AND
a.yearID = b.yearID
ORDER BY battingAvg DESC, b.RBI DESC, b.HR DESC;
-- Reasonable implemenation
SELECT COUNT(*) as numWinners, a.awardID, f.POS
FROM AwardsPlayers a
INNER JOIN Fielding f
ON a.playerID = f.playerID AND a.yearID = f.yearID
WHERE a.awardID = 'Triple Crown' OR a.awardID = 'MVP'
GROUP BY a.awardID, f.POS
ORDER BY a.awardID, numWinners DESC;
-- Consolidating outfield positions
SELECT COUNT(*) as numWinners, a.awardID, CASE WHEN f.POS IN ('CF','LF','RF') THEN 'OF' ELSE f.POS END as simplifiedPos
FROM AwardsPlayers a
INNER JOIN Fielding f
ON a.playerID = f.playerID AND a.yearID = f.yearID
WHERE a.awardID = 'Triple Crown' OR a.awardID = 'MVP'
GROUP BY a.awardID, simplifiedPos
ORDER BY a.awardID, numWinners DESC;
-- Single 'primary' position per player per year
SELECT COUNT(*) as numWinners, a.awardID, pp.primaryPosition
FROM AwardsPlayers a
INNER JOIN (
SELECT f.playerID, f.yearID, CASE WHEN f.POS IN ('CF','LF','RF') THEN 'OF' ELSE f.POS END as primaryPosition
FROM Fielding f
INNER JOIN (
SELECT playerID, yearID, MAX(G) as G
FROM Fielding
GROUP BY playerID, yearID)
mf ON f.playerID = mf.playerID AND f.yearID = mf.yearID AND f.G = mf.G
GROUP BY f.playerID, f.yearID, primaryPosition) pp
ON a.playerID = pp.playerID AND a.yearID = pp.yearID
WHERE a.awardID = 'Triple Crown' OR a.awardID = 'MVP'
GROUP BY a.awardID, pp.primaryPosition
ORDER BY a.awardID, numWinners DESC;
-- Assuming award-winning players stuck with one team that year (seems to be true)
-- If not, could use same approach from #2 to come up with a 'primary team' per player per year
SELECT COUNT(*) as numWinners, a.awardID, t.name
FROM AwardsPlayers a
INNER JOIN Batting b ON a.playerID = b.playerID AND a.yearID = b.yearID
INNER JOIN Teams t ON b.yearID = t.yearID AND b.teamID = t.teamID
WHERE a.awardID = 'Triple Crown' OR a.awardID = 'MVP'
GROUP BY a.awardID, t.name
ORDER BY a.awardID, numWinners DESC;
SELECT AVG(b.H/b.AB) as avgAVG, AVG(b.RBI) as avgRBI, AVG(b.HR) as avgHR, CASE WHEN f.POS IN ('CF','LF','RF') THEN 'OF' ELSE f.POS END as position from Batting b
INNER JOIN Fielding f
ON b.playerID = f.playerID AND b.yearID = f.yearID
WHERE b.AB >= 300
GROUP BY position;
SELECT m.*, a.awardID, COUNT(*) AS numWins FROM AwardsPlayers a
INNER JOIN (
SELECT DISTINCT a1.playerID
FROM AwardsPlayers a1
INNER JOIN (
SELECT DISTINCT playerID
FROM AwardsPlayers
WHERE awardID = 'MVP') a2
ON a1.playerID = a2.playerID
WHERE a1.awardID = 'Triple Crown') dw
ON a.playerID = dw.playerID
INNER JOIN Master m on a.playerID = m.playerID
WHERE a.awardID = 'MVP' OR a.awardID = 'Triple Crown'
GROUP BY a.playerID, a.awardID;
SELECT t.teamID, t.name,
SUM(CASE WHEN t.DivWin = 'Y' THEN 1 ELSE 0 END) AS divisionWins,
SUM(CASE WHEN t.LgWin = 'Y' THEN 1 ELSE 0 END) AS leagueWins,
SUM(CASE WHEN t.WSWin = 'Y' THEN 1 ELSE 0 END) AS worldSeriesWins
FROM Teams t
GROUP BY t.teamID
ORDER BY worldSeriesWins DESC
-- Note that MySQL has a non-standard 'IF' function...
-- But team names are out-of-date
-- soo...
SELECT mrn.mostRecentName,
SUM(CASE WHEN t.DivWin = 'Y' THEN 1 ELSE 0 END) AS divisionWins,
SUM(CASE WHEN t.LgWin = 'Y' THEN 1 ELSE 0 END) AS leagueWins,
SUM(CASE WHEN t.WSWin = 'Y' THEN 1 ELSE 0 END) AS worldSeriesWins
FROM Teams t
INNER JOIN (
SELECT tt.teamID, tt.name AS mostRecentName
FROM Teams tt
INNER JOIN (
SELECT MAX(ttt.yearID) AS lastYear, ttt.teamID
FROM Teams ttt
GROUP BY ttt.teamID) ly
ON tt.teamID = ly.teamID AND tt.yearID=ly.lastYear) mrn
ON t.teamID = mrn.teamID
GROUP BY t.teamID
SELECT m.nameFirst, m.nameLast, a.yearID, s.salary, ys.avgSalary, s.salary / ys.avgSalary * 100 as percentOfAverageSalary from AwardsPlayers a
INNER JOIN Salaries s ON a.playerID = s.playerID AND a.yearID = s.yearID
INNER JOIN (
SELECT yearID, AVG(salary) as avgSalary from Salaries
GROUP BY yearID) ys
ON a.yearID = ys.yearID
LEFT JOIN Master m
ON a.playerID = m.playerID
WHERE awardID = 'MVP'
INSERT INTO Batting
VALUES ("cabremi01", 2012,1,"FLO","NA",161,161,622,109,205,40,0,44,139,4,1,66,98,17,3,0,6,28,161);
INSERT INTO AwardsPlayers
VALUES("cabremi01","Triple Crown", 2012, "NL", null,null);
INSERT INTO Fielding
VALUES("etc","etc")
-- etc