"""This file contains code for use with "Think Stats", by Allen B. Downey, available from greenteapress.com Copyright 2010 Allen B. Downey License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html """ from __future__ import print_function from collections import defaultdict import numpy as np import sys import thinkstats2 def ReadFemPreg(dct_file='2002FemPreg.dct', dat_file='2002FemPreg.dat.gz'): """Reads the NSFG pregnancy data. dct_file: string file name dat_file: string file name returns: DataFrame """ dct = thinkstats2.ReadStataDct(dct_file) df = dct.ReadFixedWidth(dat_file, compression='gzip') CleanFemPreg(df) return df def CleanFemPreg(df): """Recodes variables from the pregnancy frame. df: DataFrame """ # mother's age is encoded in centiyears; convert to years df.agepreg /= 100.0 # birthwgt_lb contains at least one bogus value (51 lbs) # replace with NaN df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan # replace 'not ascertained', 'refused', 'don't know' with NaN na_vals = [97, 98, 99] df.birthwgt_lb.replace(na_vals, np.nan, inplace=True) df.birthwgt_oz.replace(na_vals, np.nan, inplace=True) df.hpagelb.replace(na_vals, np.nan, inplace=True) df.babysex.replace([7, 9], np.nan, inplace=True) df.nbrnaliv.replace([9], np.nan, inplace=True) # birthweight is stored in two columns, lbs and oz. # convert to a single column in lb # NOTE: creating a new column requires dictionary syntax, # not attribute assignment (like df.totalwgt_lb) df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0 # due to a bug in ReadStataDct, the last variable gets clipped; # so for now set it to NaN df.cmintvw = np.nan def MakePregMap(df): """Make a map from caseid to list of preg indices. df: DataFrame returns: dict that maps from caseid to list of indices into preg df """ d = defaultdict(list) for index, caseid in df.caseid.iteritems(): d[caseid].append(index) return d def main(script): """Tests the functions in this module. script: string script name """ df = ReadFemPreg() print(df.shape) assert len(df) == 13593 assert df.caseid[13592] == 12571 assert df.pregordr.value_counts()[1] == 5033 assert df.nbrnaliv.value_counts()[1] == 8981 assert df.babysex.value_counts()[1] == 4641 assert df.birthwgt_lb.value_counts()[7] == 3049 assert df.birthwgt_oz.value_counts()[0] == 1037 assert df.prglngth.value_counts()[39] == 4744 assert df.outcome.value_counts()[1] == 9148 assert df.birthord.value_counts()[1] == 4413 assert df.agepreg.value_counts()[22.75] == 100 assert df.totalwgt_lb.value_counts()[7.5] == 302 weights = df.finalwgt.value_counts() key = max(weights.keys()) assert df.finalwgt.value_counts()[key] == 6 print('%s: All tests passed.' % script) if __name__ == '__main__': main(*sys.argv)