107 lines
2.9 KiB
Python
Raw Normal View History

"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""
from __future__ import print_function
from collections import defaultdict
import numpy as np
import sys
import thinkstats2
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz'):
    """Loads the NSFG pregnancy data and recodes it.

    The Stata dictionary is parsed with thinkstats2.ReadStataDct, the
    resulting object reads the gzip-compressed fixed-width data file,
    and the frame is passed through CleanFemPreg before it is returned.

    dct_file: string file name of the Stata dictionary
    dat_file: string file name of the gzip-compressed data file

    returns: DataFrame
    """
    stata_dict = thinkstats2.ReadStataDct(dct_file)
    preg = stata_dict.ReadFixedWidth(dat_file, compression='gzip')

    # CleanFemPreg works on the frame it is handed; nothing is returned
    CleanFemPreg(preg)
    return preg
def CleanFemPreg(df):
    """Recodes variables from the pregnancy frame.

    Modifies df in place: rescales agepreg, replaces bogus and
    special-code values with NaN, adds the totalwgt_lb column, and
    sets cmintvw to NaN.

    Every result is written back with df[...] = ... instead of chained
    indexing or inplace=True on a column taken from the frame.  Those
    forms may operate on a temporary copy, in which case the frame is
    silently left unchanged.

    df: DataFrame

    returns: None
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df.agepreg / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN (mask() puts NaN wherever the condition holds)
    df['birthwgt_lb'] = df.birthwgt_lb.mask(df.birthwgt_lb > 20)

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    na_vals = [97, 98, 99]
    for name in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
        df[name] = df[name].replace(na_vals, np.nan)
    df['babysex'] = df.babysex.replace([7, 9], np.nan)
    df['nbrnaliv'] = df.nbrnaliv.replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan
def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    Walks the caseid column in row order, so each list holds the index
    labels of that caseid's rows in the order they appear in df.

    df: DataFrame with a caseid column

    returns: dict that maps from caseid to list of indices into preg df
    """
    d = defaultdict(list)
    # Series.items() yields (index label, value) pairs; the older
    # iteritems() spelling is gone from pandas 2.0 onwards
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d
def main(script):
    """Self-checks the functions in this module.

    Reads the pregnancy frame with the default file names, prints its
    shape, and asserts a series of expected row and value counts.  An
    AssertionError is raised at the first mismatch.

    script: string script name
    """
    preg = ReadFemPreg()
    print(preg.shape)

    assert len(preg) == 13593
    assert preg.caseid[13592] == 12571

    # (column, value, number of rows expected to hold that value)
    expected_counts = [
        ('pregordr', 1, 5033),
        ('nbrnaliv', 1, 8981),
        ('babysex', 1, 4641),
        ('birthwgt_lb', 7, 3049),
        ('birthwgt_oz', 0, 1037),
        ('prglngth', 39, 4744),
        ('outcome', 1, 9148),
        ('birthord', 1, 4413),
        ('agepreg', 22.75, 100),
        ('totalwgt_lb', 7.5, 302),
    ]
    for column, value, count in expected_counts:
        assert preg[column].value_counts()[value] == count

    # the largest sampling weight should appear exactly six times
    largest = max(preg.finalwgt.value_counts().keys())
    assert preg.finalwgt.value_counts()[largest] == 6

    print('%s: All tests passed.' % script)
if __name__ == '__main__':
    # Run the self-checks when executed as a script; the command-line
    # words are spread into main()'s positional parameters.
    argv = sys.argv
    main(*argv)