# Mirror of https://github.com/donnemartin/data-science-ipython-notebooks.git
# (synced 2024-03-22 13:30:56 +08:00; 107 lines, 2.9 KiB, Python)
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
|
|
|
|
from __future__ import print_function
|
|
|
|
from collections import defaultdict
|
|
import numpy as np
|
|
import sys
|
|
|
|
import thinkstats2
|
|
|
|
|
|
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz'):
    """Loads the NSFG pregnancy file and returns it in cleaned form.

    The Stata dictionary is parsed first; the object it yields is then
    used to read the gzip-compressed fixed-width data file.  The frame is
    passed through CleanFemPreg before being returned.

    dct_file: string file name of the Stata dictionary
    dat_file: string file name of the gzipped fixed-width data

    returns: DataFrame
    """
    dictionary = thinkstats2.ReadStataDct(dct_file)
    preg = dictionary.ReadFixedWidth(dat_file, compression='gzip')

    # recoding happens in place on the frame
    CleanFemPreg(preg)
    return preg
|
|
|
|
|
|
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    The frame is modified in place.  Every change is written back with an
    explicit column assignment (df[name] = ...) rather than by mutating a
    Series obtained through attribute access (chained assignment or
    replace(..., inplace=True)); mutating such a Series is not guaranteed
    to reach the frame, in particular when pandas Copy-on-Write is active.

    df: DataFrame with columns agepreg, birthwgt_lb, birthwgt_oz, hpagelb,
        babysex, nbrnaliv and cmintvw

    returns: None
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN (mask() puts NaN wherever the condition holds)
    df['birthwgt_lb'] = df['birthwgt_lb'].mask(df['birthwgt_lb'] > 20)

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    for name in ['birthwgt_lb', 'birthwgt_oz', 'hpagelb']:
        df[name] = df[name].replace(na_vals, np.nan)

    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan
|
|
|
|
|
|
def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    df: DataFrame with a caseid column

    returns: dict (a defaultdict of lists) that maps from caseid to list
             of indices into preg df
    """
    d = defaultdict(list)

    # Series.items() yields (index label, value) pairs; it replaces
    # Series.iteritems(), which no longer exists in pandas 2.0 and later.
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d
|
|
|
|
|
|
def main(script):
    """Runs a self-check of this module against the 2002 NSFG data.

    Loads the pregnancy frame with ReadFemPreg's default file names,
    prints its shape, and asserts a set of known row and value counts.

    script: string script name
    """
    preg = ReadFemPreg()
    print(preg.shape)

    assert len(preg) == 13593

    assert preg.caseid[13592] == 12571

    # (column, value, expected number of rows holding that value)
    expected_counts = [
        ('pregordr', 1, 5033),
        ('nbrnaliv', 1, 8981),
        ('babysex', 1, 4641),
        ('birthwgt_lb', 7, 3049),
        ('birthwgt_oz', 0, 1037),
        ('prglngth', 39, 4744),
        ('outcome', 1, 9148),
        ('birthord', 1, 4413),
        ('agepreg', 22.75, 100),
        ('totalwgt_lb', 7.5, 302),
    ]
    for column, value, count in expected_counts:
        assert preg[column].value_counts()[value] == count

    # the largest sampling weight should occur exactly six times
    weights = preg.finalwgt.value_counts()
    largest = max(weights.keys())
    assert preg.finalwgt.value_counts()[largest] == 6

    print('%s: All tests passed.' % script)
|
|
|
|
if __name__ == '__main__':
    # sys.argv[0] is the script name, which main() receives as `script`;
    # the whole argument list is unpacked into the call
    argv = sys.argv
    main(*argv)
|