diff --git a/scipy/2002FemPreg.dat.gz b/scipy/2002FemPreg.dat.gz new file mode 100644 index 0000000..2222567 Binary files /dev/null and b/scipy/2002FemPreg.dat.gz differ diff --git a/scipy/2002FemPreg.dct b/scipy/2002FemPreg.dct new file mode 100644 index 0000000..c2741be --- /dev/null +++ b/scipy/2002FemPreg.dct @@ -0,0 +1,245 @@ +infile dictionary { + _column(1) str12 caseid %12s "RESPONDENT ID NUMBER" + _column(13) byte pregordr %2f "PREGNANCY ORDER (NUMBER)" + _column(15) byte howpreg_n %2f "BB-2 # OF WEEKS OR MONTHS CURRENTLY PREGNANT" + _column(17) byte howpreg_p %1f "BB-2 CURRENT PREGNANCY LENGTH REPORTED IN MONTHS OR WEEKS" + _column(18) byte moscurrp %1f "NUMBER OF MONTHS CURRENTLY PREGNANT" + _column(19) byte nowprgdk %1f "BB-3 WHICH TRIMESTER -- CURRENT PREGNANCY" + _column(20) byte pregend1 %1f "BC-1 HOW PREGNANCY ENDED - 1ST MENTION" + _column(21) byte pregend2 %1f "BC-1 HOW PREGNANCY ENDED - 2ND MENTION" + _column(22) byte nbrnaliv %1f "BC-2 NUMBER OF BABIES BORN ALIVE FROM THIS PREGNANCY" + _column(23) byte multbrth %1f "BC-3 WAS THIS A MULTIPLE BIRTH" + _column(24) int cmotpreg %4f "CM FOR PREGNANCY END DATE (IF NONLIVEBIRTH)" + _column(28) byte prgoutcome %1f "OUTCOME OF PREGNANCY (BASED ON PRIORITY ORDERING)" + _column(29) int cmprgend %4f "CM FOR PREGNANCY END DATE (REGARDLESS OF OUTCOME)" + _column(33) byte flgdkmo1 %1f "FLAG INDICATING SEASON/DK/RF FOR BC-4A DATPRGEN_M" + _column(34) int cmprgbeg %4f "CM FOR PREGNANCY START DATE" + _column(38) byte ageatend %2f "BC-4B R'S AGE AT PREGNANCY'S END DATE" + _column(40) byte hpageend %2f "BC-4C FATHER'S AGE AT PREGNANCY'S END DATE" + _column(42) byte gestasun_m %2f "BC-5 GESTATIONAL LENGTH OF PREGNANCY IN MONTHS" + _column(44) byte gestasun_w %2f "BC-5 GESTATIONAL LENGTH OF PREGNANCY IN WEEKS" + _column(46) byte wksgest %2f "GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN WEEKS)" + _column(48) byte mosgest %2f "GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN MONTHS)" + _column(50) byte dk1gest %1f "BC-6 
DK FOLLOWUP FOR GESTATIONAL LENGTH OF A STILLBIRTH" + _column(51) byte dk2gest %1f "BC-7 DK FOLLOWUP FOR GESTATIONAL LENGTH OF A LIVEBIRTH" + _column(52) byte dk3gest %1f "BC-8 DK FOLLOWUP FOR GESTATIONAL LENGTH OF A MISCARR/ABOR/ECTOP" + _column(53) byte bpa_bdscheck1 %1f "WHETHER 1ST LIVEBORN BABY FROM THIS PREGNANCY WAS BPA OR BDS" + _column(54) byte bpa_bdscheck2 %1f "WHETHER 2ND LIVEBORN BABY FROM THIS PREGNANCY WAS BPA OR BDS" + _column(55) byte bpa_bdscheck3 %1f "WHETHER 3RD LIVEBORN BABY FROM THIS PREGNANCY WAS BPA OR BDS" + _column(56) byte babysex %1f "BD-2 SEX OF 1ST LIVEBORN BABY FROM THIS PREGNANCY" + _column(57) byte birthwgt_lb %2f "BD-3 BIRTHWEIGHT IN POUNDS - 1ST BABY FROM THIS PREGNANCY" + _column(59) byte birthwgt_oz %2f "BD-3 BIRTHWEIGHT IN OUNCES - 1ST BABY FROM THIS PREGNANCY" + _column(61) byte lobthwgt %1f "BD-4 IS BABY LOW BIRTHWEIGHT- 1ST BABY FROM THIS PREGNANCY" + _column(62) byte babysex2 %1f "BD-2 SEX OF 2ND LIVEBORN BABY FROM THIS PREGNANCY" + _column(63) byte birthwgt_lb2 %2f "BD-3 BIRTHWEIGHT IN POUNDS - 2ND BABY FROM THIS PREGNANCY" + _column(65) byte birthwgt_oz2 %2f "BD-3 BIRTHWEIGHT IN OUNCES - 2ND BABY FROM THIS PREGNANCY" + _column(67) byte lobthwgt2 %1f "BD-4 IS BABY LOW BIRTHWEIGHT- 2ND BABY FROM THIS PREGNANCY" + _column(68) byte babysex3 %1f "BD-2 SEX OF 3RD LIVEBORN BABY FROM THIS PREGNANCY" + _column(69) byte birthwgt_lb3 %2f "BD-3 BIRTHWEIGHT IN POUNDS - 3RD BABY FROM THIS PREGNANCY" + _column(71) byte birthwgt_oz3 %2f "BD-3 BIRTHWEIGHT IN OUNCES - 3RD BABY FROM THIS PREGNANCY" + _column(73) byte lobthwgt3 %1f "BD-4 IS BABY LOW BIRTHWEIGHT- 3RD BABY FROM THIS PREGNANCY" + _column(74) int cmbabdob %4f "CM FOR BABY'S OR BABIES' DATE OF BIRTH (DELIVERY DATE)" + _column(78) int kidage %4f "CURRENT AGE (IN MOS) OF R'S CHILD(REN) FROM THIS PREGNANCY" + _column(82) byte hpagelb %2f "BD-6 FATHER'S AGE AT TIME OF CHILD(REN) S BIRTH" + _column(84) byte birthplc %1f "BD-7 PLACE WHERE R GAVE BIRTH" + _column(85) byte paybirth1 %1f 
"BD-8 PAYMENT FOR DELIVERY - 1ST MENTION" + _column(86) byte paybirth2 %1f "BD-8 PAYMENT FOR DELIVERY - 2ND MENTION" + _column(87) byte paybirth3 %1f "BD-8 PAYMENT FOR DELIVERY - 3RD MENTION" + _column(88) byte knewpreg %2f "BE-1 WEEKS PREGNANT WHEN R LEARNED SHE WAS PREGNANT" + _column(90) byte trimestr %1f "BE-2A DK FOLLOWUP FOR KNEWPREG WHEN GESTATION >= 6 MOS" + _column(91) byte ltrimest %1f "BE-2B DK FOLLOWUP FOR KNEWPREG WHEN GESTATION < 6 MOS" + _column(92) byte priorsmk %1f "BE-3 AMOUNT R'SMOKED IN 6 MOS BEFORE R KNEW SHE WAS PREGNANT" + _column(93) byte postsmks %1f "BE-4 R'SMOKED AT ALL AFTER R KNEW SHE WAS PREGNANT" + _column(94) byte npostsmk %1f "BE-5 AMOUNT R'SMOKED DURING PREGNANCY AFTER R KNEW SHE WAS PREG" + _column(95) byte getprena %1f "BE-6 ANY PRENATAL CARE FOR THIS PREGNANCY" + _column(96) byte bgnprena %2f "BE-7 WEEKS PREGNANT AT FIRST PRENATAL CARE VISIT" + _column(98) byte pnctrim %1f "BE-8A DK FOLLOWUP FOR BGNPRENA WHEN GESTATION >= 6 MOS" + _column(99) byte lpnctri %1f "BE-8B DK FOLLOWUP FOR BGNPRENA WHEN GESTATION < 6 MOS" + _column(100) byte workpreg %1f "BF-1 R WORKED AT ALL DURING THIS PREGNANCY" + _column(101) byte workborn %1f "BF-2 MATERNITY LEAVE TAKEN FOR THIS PREGNANCY" + _column(102) byte didwork %1f "BF-3 WHY NO MATERNITY LEAVE WAS TAKEN FOR THIS PREGNANCY" + _column(103) byte matweeks %2f "BF-4 WEEKS OF MATERNITY LEAVE TAKEN FOR THIS PREGNANCY" + _column(105) byte weeksdk %1f "BF-5 DK FOLLOWUP - WAS MATERNITY LEAVE <=4 OR > 4 WEEKS" + _column(106) byte matleave %2f "BF-6 WEEKS OF PAID MATERNITY LEAVE FOR THIS PREGNANCY" + _column(108) byte matchfound %1f "CHECK ON WHETHER CHILD MATCHES BIO CHILD IN HH ROSTER - 1ST" + _column(109) byte livehere %1f "BG-1 WHETHER CHILD LIVES WITH R - 1ST FROM THIS PREGNANCY" + _column(110) byte alivenow %1f "BG-2 WHETHER CHILD IS STILL ALIVE - 1ST FROM THIS PREGNANCY" + _column(111) int cmkidied %4f "CM FOR CHLD'S DATE OF DEATH - 1ST FROM THIS PREGNANCY" + _column(115) int cmkidlft %4f "CM FOR 
DATE CHILD STOPPED LIVING W/R - 1ST FROM THIS PREGNANCY" + _column(119) int lastage %3f "AGE (IN MOS) WHEN CHILD LAST LIVED W/R-1ST FROM THIS PREGNANCY" + _column(122) byte wherenow %1f "BG-5 WHERE CHILD LIVES NOW - 1ST FROM THIS PREGNANCY" + _column(123) byte legagree %1f "BG-6 LEGAL AGREEMENT FOR WHERE CHILD LIVES - 1ST FROM THIS PREG" + _column(124) byte parenend %1f "BG-7 IS R STILL LEGAL MOTHER OF CHILD - 1ST FROM THIS PREGNANCY" + _column(125) byte anynurse %1f "BH-1 WHETHER R BREASTFED THIS CHILD AT ALL - 1ST FROM THIS PREG" + _column(126) byte fedsolid %1f "BH-2 HAS R BEGUN SUPPLEMENTATION FOR CHILD - 1ST FROM THIS PREG" + _column(127) int frsteatd_n %3f "BH-3 AGE (MOS/WKS/DAY) WHEN 1ST SUPPLEMENTED - 1ST FROM THIS PREG" + _column(130) byte frsteatd_p %1f "BH-3 UNITS (MOS/WKS/DAYS) FOR FRSTEATD_N - 1ST FROM THIS PREG" + _column(131) int frsteatd %3f "AGE (IN MOS) WHEN 1ST SUPPLEMENTED - 1ST FROM THIS PREG" + _column(134) byte quitnurs %1f "BH-4 HAS R'STOPPED BREASTFEEDING CHILD - 1ST FROM THIS PREG" + _column(135) int ageqtnur_n %3f "BH-5 AGE (MOS/WKS/DAY) WHEN STOPPED BREASTFEEDING - 1ST FROM THIS PREG" + _column(138) byte ageqtnur_p %1f "BH-5 UNITS (MOS/WKS/DAYS) FOR AGEQTNUR_N - 1ST FROM THIS PREG" + _column(139) int ageqtnur %3f "AGE (IN MOS) WHEN R'STOPPED NURSING CHILD - 1ST FROM THIS PREG" + _column(142) byte matchfound2 %1f "CHECK ON WHETHER CHILD MATCHES BIO CHILD IN HH ROSTER - 2ND" + _column(143) byte livehere2 %1f "BG-1 WHETHER CHILD LIVES WITH R - 2ND FROM THIS PREGNANCY" + _column(144) byte alivenow2 %1f "BG-2 WHETHER CHILD IS STILL ALIVE - 2ND FROM THIS PREGNANCY" + _column(145) int cmkidied2 %4f "CM FOR CHLD'S DATE OF DEATH - 2ND FROM THIS PREGNANCY" + _column(149) int cmkidlft2 %4f "CM FOR DATE CHILD STOPPED LIVING W/R - 2ND FROM THIS PREGNANCY" + _column(153) int lastage2 %3f "AGE (IN MOS) WHEN CHILD LAST LIVED W/R - 2ND FROM THIS PREGNANCY" + _column(156) byte wherenow2 %1f "BG-5 WHERE CHILD LIVES NOW - 2ND FROM THIS PREGNANCY" + 
_column(157) byte legagree2 %1f "BG-6 LEGAL AGREEMENT FOR WHERE CHILD LIVES - 2ND FROM THIS PREG" + _column(158) byte parenend2 %1f "BG-7 IS R STILL LEGAL MOTHER OF CHILD - 2ND FROM THIS PREGNANCY" + _column(159) byte anynurse2 %1f "BH-1 WHETHER R BREASTFED THIS CHILD AT ALL - 2ND FROM THIS PREG" + _column(160) byte fedsolid2 %1f "BH-2 HAS R BEGUN SUPPLEMENTATION FOR CHILD - 2ND FROM THIS PREG" + _column(161) byte frsteatd_n2 %2f "BH-3 AGE (MOS/WKS/DAY) WHEN 1ST SUPPLEMENTED - 2ND FROM THIS PREG" + _column(163) byte frsteatd_p2 %1f "BH-3 UNITS (MOS/WKS/DAYS) FOR FRSTEATD_N - 2ND FROM THIS PREG" + _column(164) byte frsteatd2 %2f "AGE (IN MOS) WHEN 1ST SUPPLEMENTED - 2ND FROM THIS PREG" + _column(166) byte quitnurs2 %1f "BH-4 HAS R'STOPPED BREASTFEEDING CHILD - 2ND FROM THIS PREG" + _column(167) byte ageqtnur_n2 %2f "BH-5 AGE (MOS/WKS/DAY) WHEN STOPPED BREASTFEEDING - 2ND FROM THIS PREG" + _column(169) byte ageqtnur_p2 %1f "BH-5 UNITS (MOS/WKS/DAYS) FOR AGEQTNUR_N - 2ND FROM THIS PREG" + _column(170) byte ageqtnur2 %2f "AGE (IN MOS) WHEN R'STOPPED NURSING CHILD - 2ND FROM THIS PREG" + _column(172) byte matchfound3 %1f "CHECK ON WHETHER CHILD MATCHES BIO CHILD IN HH ROSTER - 3RD" + _column(173) byte livehere3 %1f "BG-1 WHETHER CHILD LIVES WITH R - 3RD FROM THIS PREGNANCY" + _column(174) byte alivenow3 %1f "BG-2 WHETHER CHILD IS STILL ALIVE - 3RD FROM THIS PREGNANCY" + _column(175) int cmkidied3 %4f "CM FOR CHLD'S DATE OF DEATH - 3RD FROM THIS PREGNANCY" + _column(179) int cmkidlft3 %4f "CM FOR DATE CHILD STOPPED LIVING W/R - 3RD FROM THIS PREGNANCY" + _column(183) int lastage3 %3f "AGE (IN MOS) WHEN CHILD LAST LIVED W/R - 3RD FROM THIS PREGNANCY" + _column(186) byte wherenow3 %1f "BG-5 WHERE CHILD LIVES NOW - 3RD FROM THIS PREGNANCY" + _column(187) byte legagree3 %1f "BG-6 LEGAL AGREEMENT FOR WHERE CHILD LIVES - 3RD FROM THIS PREG" + _column(188) byte parenend3 %1f "BG-7 IS R STILL LEGAL MOTHER OF CHILD - 3RD FROM THIS PREGNANCY" + _column(189) byte anynurse3 %1f 
"BH-1 WHETHER R BREASTFED THIS CHILD AT ALL - 3RD FROM THIS PREG" + _column(190) byte fedsolid3 %1f "BH-2 HAS R BEGUN SUPPLEMENTATION FOR CHILD - 3RD FROM THIS PREG" + _column(191) byte frsteatd_n3 %1f "BH-3 AGE (MOS/WKS/DAY) WHEN 1ST SUPPLEMENTED - 3RD FROM THIS PREG" + _column(192) byte frsteatd_p3 %1f "BH-3 UNITS (MOS/WKS/DAYS) FOR FRSTEATD_N - 3RD FROM THIS PREG" + _column(193) byte frsteatd3 %1f "AGE (IN MOS) WHEN 1ST SUPPLEMENTED - 3RD FROM THIS PREG" + _column(194) byte quitnurs3 %1f "BH-4 HAS R'STOPPED BREASTFEEDING CHILD - 3RD FROM THIS PREG" + _column(195) byte ageqtnur_n3 %1f "BH-5 AGE (MOS/WKS/DAY) WHEN STOPPED BREASTFEEDING - 3RD FROM THIS PREG" + _column(196) byte ageqtnur_p3 %1f "BH-5 UNITS (MOS/WKS/DAYS) FOR AGEQTNUR_N - 3RD FROM THIS PREG" + _column(197) byte ageqtnur3 %1f "AGE (IN MOS) WHEN R'STOPPED NURSING CHILD - 3RD FROM THIS PREG" + _column(198) int cmlastlb %4f "CM FOR R'S MOST RECENT LIVE BIRTH" + _column(202) int cmfstprg %4f "CM FOR R'S FIRST COMPLETED PREGNANCY" + _column(206) int cmlstprg %4f "CM FOR R'S MOST RECENT COMPLETED PREGNANCY" + _column(210) int cmintstr %4f "CM FOR DATE OF BEGINNING OF PREGNANCY INTERVAL" + _column(214) int cmintfin %4f "CM FOR DATE OF END OF PREGNANCY INTERVAL" + _column(218) int cmintstrop %4f "OPEN INTERVAL: CM OF DATE OF BEGINNING" + _column(222) int cmintfinop %4f "OPEN INTERVAL: CM OF DATE OF END (MON OF INTERVIEW)" + _column(226) int cmintstrcr %4f "CURRENTLY PREGNANT: CM OF DATE OF BEGINNING OF INTERVAL" + _column(230) int cmintfincr %4f "CURRENTLY PREGNANT: CM OF DATE OF END OF INTERVAL (MON OF INTERVIEW)" + _column(234) byte evuseint %1f "EG-1 USE ANY METHOD IN PREGNANCY INTERVAL?" + _column(235) byte stopduse %1f "EG-2 BEFORE YOU BECAME PREG, STOP USING ALL METHODS?" + _column(236) byte whystopd %1f "EG-3 STOP USING METHODS BEFORE PREG BECAUSE WANTED PREG?" 
+ _column(237) byte whatmeth01 %2f "EG-4 METHOD(S) USING WHEN BECAME PREG - 1ST MENTION" + _column(239) byte whatmeth02 %2f "EG-4 METHOD(S) USING WHEN BECAME PREG - 2ND MENTION" + _column(241) byte whatmeth03 %2f "EG-4 METHOD(S) USING WHEN BECAME PREG - 3RD MENTION" + _column(243) byte whatmeth04 %2f "EG-4 METHOD(S) USING WHEN BECAME PREG - 4TH MENTION" + _column(245) byte resnouse %1f "EG-5 REASON NOT USING/HAD STOPPED USING METHOD BEC. WANTED PREG?" + _column(246) byte wantbold %1f "EG-6 RIGHT BEF PREG, WANT TO HAVE BABY AT ANY TIME IN FUTURE?" + _column(247) byte probbabe %1f "EG-7 PROBABLY WANT BABY AT ANY TIME OR NOT?" + _column(248) byte cnfrmno %1f "EG-8 VERIFY DIDN'T WANT BABY AT ANY TIME IN FUTURE" + _column(249) byte wantbld2 %1f "EG-9 RIGHT BEFORE PREG, WANT TO HAVE BABY AT ANY TIME IN FUTURE? (2ND ASKING)" + _column(250) byte timingok %1f "EG-10 BECOME PREG TOO SOON, RIGHT TIME, OR LATER THAN YOU WANTED?" + _column(251) int toosoon_n %3f "EG-11 HOW MUCH SOONER THAN WANTED BECAME PREG (MONTHS OR YEARS)" + _column(254) byte toosoon_p %1f "EG-11 CHOOSE MONS OR YRS FOR HOW MUCH SOONER BECAME PREG THAN WANTED" + _column(255) byte wthpart1 %1f "EG-12A RIGHT BEFORE PREG, WANT TO HAVE BABY WITH THAT PARTNER?" + _column(256) byte wthpart2 %1f "EG-12B RIGHT BEF. PREG, THINK MIGHT EVER WANT TO HAVE BABY W/THAT PARTNER?" + _column(257) byte feelinpg %2f "EG-13 HAPPINESS TO BE PREG. SCALE (1-10)" + _column(259) byte hpwnold %1f "EG-16 RIGHT BEF PREG, DID THE FATHER WANT R TO HAVE BABY AT ANY TIME IN FUTURE?" 
+ _column(260) byte timokhp %1f "EG-17 R BECAME PREG SOONER, RIGHT TIME, OR LATER THAN FATHER OF PREG WANTED" + _column(261) byte cohpbeg %1f "EG-18A WAS R LIVING W/FATHER OF PREG AT BEGINNING OF PREG" + _column(262) byte cohpend %1f "EG-18B WAS R LIVING W/FATHER OF PREG WHEN PREG ENDED/BABY WAS BORN" + _column(263) byte tellfath %1f "EG-19 DID R TELL FATHER OF PREG THAT SHE WAS PREGNANT" + _column(264) byte whentell %1f "EG-20 WHEN DID R TELL FATHER OF PREG ABOUT PREGNANCY: DURING OR AFTER?" + _column(265) byte tryscale %2f "EG-21 HOW HARD TRYING TO GET/AVOID PREGNANCY (0-10)" + _column(267) byte wantscal %2f "EG-22 HOW MUCH WANTED TO GET/AVOID PREGNANCY (0-10)" + _column(269) byte whyprg1 %1f "EG-23 (UNINTENDED PREG): METHOD FAIL OR R WASN T USING PROPERLY-1ST MENTION" + _column(270) byte whyprg2 %1f "EG-23 (UNINTENDED PREG): METHOD FAIL OR R WASN T USING PROPERLY-2ND MENTION" + _column(271) byte whynouse1 %1f "EG-24 (UNINTENDED PREG) REASON DIDN'T USE METHOD - 1ST MENTION" + _column(272) byte whynouse2 %1f "EG-24 (UNINTENDED PREG) REASON DIDN'T USE METHOD - 2ND MENTION" + _column(273) byte whynouse3 %1f "EG-24 (UNINTENDED PREG) REASON DIDN'T USE METHOD - 3RD MENTION" + _column(274) byte anyusint %1f "ANY METHOD USE IN PREGNANCY INTERVAL" + _column(275) byte prglngth %2f "DURATION OF COMPLETED PREGNANCY IN WEEKS" + _column(277) byte outcome %1f "PREGNANCY OUTCOME" + _column(278) byte birthord %2f "BIRTH ORDER" + _column(280) int datend %4f "CM DATE PREGNANCY ENDED" + _column(284) int agepreg %4f "AGE AT PREGNANCY OUTCOME" + _column(288) int datecon %4f "CM DATE OF CONCEPTION" + _column(292) int agecon %4f "AGE AT TIME OF CONCEPTION" + _column(296) byte fmarout5 %1f "FORMAL MARITAL STATUS AT PREGNANCY OUTCOME" + _column(297) byte pmarpreg %1f "WHETHER PREGNANCY ENDED BEFORE R'S 1ST MARRIAGE (PREMARITALLY)" + _column(298) byte rmarout6 %1f "INFORMAL MARITAL STATUS AT PREGNANCY OUTCOME - 6 CATEGORIES" + _column(299) byte fmarcon5 %1f "FORMAL MARITAL STATUS AT 
CONCEPTION - 5 CATEGORIES" + _column(300) byte learnprg %2f "NUMBER OF WEEKS PREGNANT WHEN R LEARNED SHE WAS PREGNANT" + _column(302) byte pncarewk %2f "NUMBER OF WEEKS PREGNANT AT FIRST PRENATAL CARE" + _column(304) byte paydeliv %1f "PAYMENT FOR DELIVERY" + _column(305) byte lbw1 %1f "LOW BIRTHWEIGHT - BABY 1" + _column(306) int bfeedwks %3f "DURATION OF BREASTFEEDING IN WEEKS" + _column(309) byte maternlv %1f "USE OF MATERNITY LEAVE" + _column(310) byte oldwantr %1f "WANTEDNESS OF PREGNANCY - RESPONDENT - CYCLE 4 VERSION" + _column(311) byte oldwantp %1f "WANTEDNESS OF PREG - R'S PARTNER (FATHER OF PREGNANCY) - CYCLE 4 VERSION" + _column(312) byte wantresp %1f "WANTEDNESS OF PREGNANCY - RESPONDENT - CYCLE 5 VERSION" + _column(313) byte wantpart %1f "WANTEDNESS OF PREG - R'S PARTNER (FATHER OF PREGNANCY) - CYCLE 5 VERSION" + _column(314) int cmbirth %4f "CENTURY MONTH OF R'S BIRTH" + _column(318) byte ager %2f "AGE AT INTERVIEW" + _column(320) byte agescrn %2f "R'S AGE AT SCREENER" + _column(322) byte fmarital %1f "FORMAL MARITAL STATUS" + _column(323) byte rmarital %1f "INFORMAL MARITAL STATUS" + _column(324) byte educat %2f "EDUCATION (COMPLETED YEARS OF SCHOOLING)" + _column(326) byte hieduc %2f "HIGHEST COMPLETED YEAR OF SCHOOL OR DEGREE" + _column(328) byte race %1f "RACE" + _column(329) byte hispanic %1f "HISPANIC ORIGIN" + _column(330) byte hisprace %1f "RACE AND HISPANIC ORIGIN" + _column(331) byte rcurpreg %1f "PREGNANT AT TIME OF INTERVIEW" + _column(332) byte pregnum %2f "CAPI-BASED TOTAL NUMBER OF PREGNANCIES" + _column(334) byte parity %2f "TOTAL NUMBER OF LIVE BIRTHS" + _column(336) byte insuranc %1f "HEALTH INSURANCE COVERAGE STATUS" + _column(337) byte pubassis %1f "WHETHER R RECEIVED PUBLIC ASSISTANCE IN 2001" + _column(338) int poverty %3f "POVERTY LEVEL INCOME" + _column(341) byte laborfor %1f "LABOR FORCE STATUS" + _column(342) byte religion %1f "CURRENT RELIGIOUS AFFILIATION" + _column(343) byte metro %1f "PLACE OF RESIDENCE (METROPOLITAN / 
NONMETROPOLITAN)" + _column(344) byte brnout %1f "IB-8 R BORN OUTSIDE OF US" + _column(345) int yrstrus %4f "YEAR R CAME TO THE UNITED STATES" + _column(349) byte prglngth_i %1f "PRGLNGTH IMPUTATION FLAG" + _column(350) byte outcome_i %1f "OUTCOME IMPUTATION FLAG" + _column(351) byte birthord_i %1f "BIRTHORD IMPUTATION FLAG" + _column(352) byte datend_i %1f "DATEND IMPUTATION FLAG" + _column(353) byte agepreg_i %1f "AGEPREG IMPUTATION FLAG" + _column(354) byte datecon_i %1f "DATECON IMPUTATION FLAG" + _column(355) byte agecon_i %1f "AGECON IMPUTATION FLAG" + _column(356) byte fmarout5_i %1f "FMAROUT5 IMPUTATION FLAG" + _column(357) byte pmarpreg_i %1f "PMARPREG IMPUTATION FLAG" + _column(358) byte rmarout6_i %1f "RMAROUT6 IMPUTATION FLAG" + _column(359) byte fmarcon5_i %1f "FMARCON5 IMPUTATION FLAG" + _column(360) byte learnprg_i %1f "LEARNPRG IMPUTATION FLAG" + _column(361) byte pncarewk_i %1f "PNCAREWK IMPUTATION FLAG" + _column(362) byte paydeliv_i %1f "PAYDELIV IMPUTATION FLAG" + _column(363) byte lbw1_i %1f "LBW1 IMPUTATION FLAG" + _column(364) byte bfeedwks_i %1f "BFEEDWKS IMPUTATION FLAG" + _column(365) byte maternlv_i %1f "MATERNLV IMPUTATION FLAG" + _column(366) byte oldwantr_i %1f "OLDWANTR IMPUTATION FLAG" + _column(367) byte oldwantp_i %1f "OLDWANTP IMPUTATION FLAG" + _column(368) byte wantresp_i %1f "WANTRESP IMPUTATION FLAG" + _column(369) byte wantpart_i %1f "WANTPART IMPUTATION FLAG" + _column(370) byte ager_i %1f "AGER IMPUTATION FLAG" + _column(371) byte fmarital_i %1f "FMARITAL IMPUTATION FLAG" + _column(372) byte rmarital_i %1f "RMARITAL IMPUTATION FLAG" + _column(373) byte educat_i %1f "EDUCAT IMPUTATION FLAG" + _column(374) byte hieduc_i %1f "HIEDUC IMPUTATION FLAG" + _column(375) byte race_i %1f "RACE IMPUTATION FLAG" + _column(376) byte hispanic_i %1f "HISPANIC IMPUTATION FLAG" + _column(377) byte hisprace_i %1f "HISPRACE IMPUTATION FLAG" + _column(378) byte rcurpreg_i %1f "RCURPREG IMPUTATION FLAG" + _column(379) byte pregnum_i %1f "PREGNUM 
IMPUTATION FLAG" + _column(380) byte parity_i %1f "PARITY IMPUTATION FLAG" + _column(381) byte insuranc_i %1f "INSURANC IMPUTATION FLAG" + _column(382) byte pubassis_i %1f "PUBASSIS IMPUTATION FLAG" + _column(383) byte poverty_i %1f "POVERTY IMPUTATION FLAG" + _column(384) byte laborfor_i %1f "LABORFOR IMPUTATION FLAG" + _column(385) byte religion_i %1f "RELIGION IMPUTATION FLAG" + _column(386) byte metro_i %1f "METRO IMPUTATION FLAG" + _column(387) float basewgt %18f "BASE WEIGHT" + _column(405) double adj_mod_basewgt %18f "ADJUSTED MODIFIED BASE WEIGHT" + _column(423) double finalwgt %18f "FINAL POST-STRATIFIED AND ADJUSTED WEIGHT" + _column(441) byte secu_p %1f "SCRAMBLED VERSION OF THE SAMPLING ERROR COMPUTATIONAL UNIT" + _column(442) byte sest %2f "SCRAMBLED VERSION OF THE STRATUM" + _column(444) int cmintvw %4f "CENTURY MONTH OF INTERVIEW DATE" + } diff --git a/scipy/first.py b/scipy/first.py new file mode 100644 index 0000000..4c5fccf --- /dev/null +++ b/scipy/first.py @@ -0,0 +1,160 @@ +"""This file contains code used in "Think Stats", +by Allen B. Downey, available from greenteapress.com + +Copyright 2014 Allen B. Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function + +import math +import numpy as np + +import nsfg +import thinkstats2 +import thinkplot + + +def MakeFrames(): + """Reads pregnancy data and partitions first babies and others. 
+ + returns: DataFrames (all live births, first babies, others) + """ + preg = nsfg.ReadFemPreg() + + live = preg[preg.outcome == 1] + firsts = live[live.birthord == 1] + others = live[live.birthord != 1] + + assert len(live) == 9148 + assert len(firsts) == 4413 + assert len(others) == 4735 + + return live, firsts, others + + +def Summarize(live, firsts, others): + """Print various summary statistics.""" + + mean = live.prglngth.mean() + var = live.prglngth.var() + std = live.prglngth.std() + + print('Live mean', mean) + print('Live variance', var) + print('Live std', std) + + mean1 = firsts.prglngth.mean() + mean2 = others.prglngth.mean() + + var1 = firsts.prglngth.var() + var2 = others.prglngth.var() + + print('Mean') + print('First babies', mean1) + print('Others', mean2) + + print('Variance') + print('First babies', var1) + print('Others', var2) + + print('Difference in weeks', mean1 - mean2) + print('Difference in hours', (mean1 - mean2) * 7 * 24) + + print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100) + + d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth) + print('Cohen d', d) + + +def PrintExtremes(live): + """Plots the histogram of pregnancy lengths and prints the extremes. 
+ + live: DataFrame of live births + """ + hist = thinkstats2.Hist(live.prglngth) + thinkplot.Hist(hist, label='live births') + + thinkplot.Save(root='first_nsfg_hist_live', + title='Histogram', + xlabel='weeks', + ylabel='frequency') + + print('Shortest lengths:') + for weeks, freq in hist.Smallest(10): + print(weeks, freq) + + print('Longest lengths:') + for weeks, freq in hist.Largest(10): + print(weeks, freq) + + +def MakeHists(live): + """Plot Hists for live births + + live: DataFrame + others: DataFrame + """ + hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb') + thinkplot.Hist(hist) + thinkplot.Save(root='first_wgt_lb_hist', + xlabel='pounds', + ylabel='frequency', + axis=[-1, 14, 0, 3200]) + + hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz') + thinkplot.Hist(hist) + thinkplot.Save(root='first_wgt_oz_hist', + xlabel='ounces', + ylabel='frequency', + axis=[-1, 16, 0, 1200]) + + hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg') + thinkplot.Hist(hist) + thinkplot.Save(root='first_agepreg_hist', + xlabel='years', + ylabel='frequency') + + hist = thinkstats2.Hist(live.prglngth, label='prglngth') + thinkplot.Hist(hist) + thinkplot.Save(root='first_prglngth_hist', + xlabel='weeks', + ylabel='frequency', + axis=[-1, 53, 0, 5000]) + + +def MakeComparison(firsts, others): + """Plots histograms of pregnancy length for first babies and others. 
+ + firsts: DataFrame + others: DataFrame + """ + first_hist = thinkstats2.Hist(firsts.prglngth, label='first') + other_hist = thinkstats2.Hist(others.prglngth, label='other') + + width = 0.45 + thinkplot.PrePlot(2) + thinkplot.Hist(first_hist, align='right', width=width) + thinkplot.Hist(other_hist, align='left', width=width) + + thinkplot.Save(root='first_nsfg_hist', + title='Histogram', + xlabel='weeks', + ylabel='frequency', + axis=[27, 46, 0, 2700]) + + +def main(script): + live, firsts, others = MakeFrames() + + MakeHists(live) + PrintExtremes(live) + MakeComparison(firsts, others) + Summarize(live, firsts, others) + + +if __name__ == '__main__': + import sys + main(*sys.argv) + + diff --git a/scipy/hypothesis.ipynb b/scipy/hypothesis.ipynb new file mode 100644 index 0000000..19317b0 --- /dev/null +++ b/scipy/hypothesis.ipynb @@ -0,0 +1,652 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hypothesis Testing\n", + "=============================\n", + "\n", + "Credits: Forked from [CompStats](https://github.com/AllenDowney/CompStats) by Allen Downey. License: [Creative Commons Attribution 4.0 International](http://creativecommons.org/licenses/by/4.0/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import print_function, division\n", + "\n", + "import numpy\n", + "import scipy.stats\n", + "\n", + "import matplotlib.pyplot as pyplot\n", + "\n", + "from IPython.html.widgets import interact, fixed\n", + "from IPython.html import widgets\n", + "\n", + "import first\n", + "\n", + "# seed the random number generator so we all get the same results\n", + "numpy.random.seed(19)\n", + "\n", + "# some nicer colors from http://colorbrewer2.org/\n", + "COLOR1 = '#7fc97f'\n", + "COLOR2 = '#beaed4'\n", + "COLOR3 = '#fdc086'\n", + "COLOR4 = '#ffff99'\n", + "COLOR5 = '#386cb0'\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Part One\n", + "========\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an example, let's look at differences between groups. The example I use in _Think Stats_ is first babies compared with others. The `first` module provides code to read the data into three pandas Dataframes." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "live, firsts, others = first.MakeFrames()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The apparent effect we're interested in is the difference in the means. Other examples might include a correlation between variables or a coefficient in a linear regression. The number that quantifies the size of the effect, whatever it is, is the \"test statistic\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def TestStatistic(data):\n", + " group1, group2 = data\n", + " test_stat = abs(group1.mean() - group2.mean())\n", + " return test_stat" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the first example, I extract the pregnancy length for first babies and others. The results are pandas Series objects." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "group1 = firsts.prglngth\n", + "group2 = others.prglngth" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The actual difference in the means is 0.078 weeks, which is only 13 hours." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.078037266777549519" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actual = TestStatistic((group1, group2))\n", + "actual" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The null hypothesis is that there is no difference between the groups. We can model that by forming a pooled sample that includes first babies and others." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "n, m = len(group1), len(group2)\n", + "pool = numpy.hstack((group1, group2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we can simulate the null hypothesis by shuffling the pool and dividing it into two groups, using the same sizes as the actual sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def RunModel():\n", + " numpy.random.shuffle(pool)\n", + " data = pool[:n], pool[n:]\n", + " return data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result of running the model is two NumPy arrays with the shuffled pregnancy lengths:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([36, 40, 39, ..., 43, 42, 40]), array([43, 39, 32, ..., 37, 35, 41]))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "RunModel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we compute the same test statistic using the simulated data:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.081758440969863955" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "TestStatistic(RunModel())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we run the model 1000 times and compute the test statistic, we can see how much the test statistic varies under the null hypothesis." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(1000,)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_stats = numpy.array([TestStatistic(RunModel()) for i in range(1000)])\n", + "test_stats.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's the sampling distribution of the test statistic under the null hypothesis, with the actual difference in means indicated by a gray line." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEPCAYAAABRHfM8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAE0ZJREFUeJzt3X2QnVVhx/HvJQEhxmxIwbzsRoO8TEWCQCyk4svVpim2\nClgqkFaNhaozFHGsM5XoTMnaim/FKqXasQgGlNSMlDSMtSRQbwsdkkAgBFioJCVjNkKCvIaxYrK5\n/eOczT7Zvbs5u3ef+zy7+/3MPHPPPc/b2cPl/nKetwuSJEmSJEmSJEmSJEmSJEmScnQksAHYDHQB\nX4z1M4B1wE+BtcD0zDrLgCeAx4HFLWupJKlwU+LrZGA98DbgK8BfxvrPAF+K5ZMJ4XI4MA/YChzW\nqoZKksphCnAf8CbC6GFmrJ8V30MYXXwms86/Awtb1UBJ0tDy/hf8YYRRwy7gJ8CjhLDYFefvoi88\n5gDdmXW7gfac2ydJSjQ55+3vB04D2oA7gHf1m1+P02CGmidJaqG8A6PXi8CPgAWEUcUs4GlgNrA7\nLrMTmJtZpyPWHeT444+vb9u2LdfGStI4tA04oZkN5HlI6hj6roA6Cvhd4EFgDbA01i8FVsfyGuBi\n4AjgOOBEYGP/jW7bto16ve40StNVV11VeBuGO91///0HpqLbMtb7ssyT/Tm6E3B8s1/qeY4wZgMr\nCKF0GHAzcBchNFYBlwLbgQvj8l2xvgvYB1yGh6QkqTTyDIyHgTMa1D8HLBpknavjJEkqGe9zmOCq\n1WrRTRg37MvRZX+WT6XoBoxAPR6P0wS1adOmA+UFCxYU2BJp7KhUKtDkd74jDElSEgNDkpTEwJAk\nJTEwJElJDAxJUhIDQ5KUxMCQJCUxMCRJSQwMSVISA0OSlMTAkCQlMTAkSUkMDElSEgNDkpTEwJAk\nJTEwJElJDAxJUhIDQ5KUxMCQJCUxMCRJSQwMSVISA0OSlMTAkCQlMTAkSUkMDElSkslFN2Akurq6\nWr7PKVOmMG/evJbvV5LKIs/AmAvcBLwWqAPfBq4FlgN/BjwTl/ss8ONYXgZcAvQAVwBrG234HYve\nm1ebB/X87p/x8p49HHXUUS3ftySVQZ6BsRf4FLAZmApsAtYRwuNrcco6GbgovrYDdwInAfv7b/jE\nC/4ut0YP5oHrl9DT09Py/UpSWeR5DuNpQlgAvAw8RggCgEqD5c8DVhKCZjuwFTgzx/ZJkoahVSe9\n5wGnA+
vj+08ADwHfAabHujlAd2adbvoCRpJUsFYExlTgh8AnCSONbwHHAacBTwHXDLFuPffWSZKS\n5H2V1OHArcD3gNWxbndm/vXA7bG8k3CivFdHrBtgx4aVB8rT2k+hrWP+KDVXksaHWq1GrVYb1W02\nOpcwmtteATxLOPndazZhZEGs/y3gjwknu28hnLfoPel9AgNHGfWFl6+m1R64fgnPPrObqVOntnzf\nOtimTZsOlBcsWFBgS6Sxo1KpQJPf+XmOMM4GPghsAR6MdZ8FlhAOR9WBJ4GPx3ldwKr4ug+4DA9J\nSVJp5BkY99D4HMmPG9T1ujpOkqSS8dEgkqQkBoYkKYmBIUlKYmBIkpIYGJKkJAaGJCmJgSFJSmJg\nSJKSGBiSpCQGhiQpiYEhSUpiYEiSkhgYkqQkBoYkKYmBIUlKYmBIkpIYGJKkJAaGJCmJgSFJSmJg\nSJKSGBiSpCQGhiQpiYEhSUpiYAzDnPYOKpVKy6e26UcX/adLEpOLbsBYsuelF1l4+eqW73f9dee3\nfJ+S1J8jDElSEgNDkpTEwJAkJTEwJElJ8gyMucBPgEeBR4ArYv0MYB3wU2AtMD2zzjLgCeBxYHGO\nbZMkDVOegbEX+BTwJmAh8OfAG4ErCYFxEnBXfA9wMnBRfD0H+GbO7Rs7KpMKuZzXS3olZeV5We3T\ncQJ4GXgMaAfOBd4Z61cANUJonAesJATNdmArcCawPsc2jg31nkIu5wUv6ZXUp1X/gp8HnA5sAGYC\nu2L9rvgeYA7QnVmnmxAwkqQSaMWNe1OBW4FPAnv6zavHaTAN5+3YsPJAeVr7KbR1zG+yiZI0vtRq\nNWq12qhuM+/AOJwQFjcDvcdUdgGzCIerZgO7Y/1OwonyXh2xboC5Zy3Jo62SNG5Uq1Wq1eqB952d\nnU1vM89DUhXgO0AX8PVM/RpgaSwvpS9I1gAXA0cAxwEnAhtzbJ8kaRjyHGGcDXwQ2AI8GOuWAV8C\nVgGXEk5uXxjndcX6LmAfcBlDH66SJLVQnoFxD4OPYBYNUn91nCRJJeN9DpKkJAaGJCmJgSFJSmJg\nSJKSGBiSpCQGhiQpiYEhSUpiYEiSkhgYkqQkBoYkKYmBIUlKYmBIkpIYGJKkJAaGJCmJgSFJSmJg\nSJKSGBiSpCQGhiQpiYEhSUpiYEiSkhgYkqQkKYFxV2KdJGkcmzzEvKOAKcCxwIxM/TSgPc9GSZLK\nZ6jA+DjwSWAOsClTvwe4Ls9GSZLKZ6jA+HqcrgCubU1zJEllNVRg9LoWeCswr9/yN+XRIElSOaUE\nxveANwCbgZ5MvYEhSRNISmAsAE4G6jm3RZJUYimX1T4CzB7h9m8AdgEPZ+qWA93Ag3F6T2beMuAJ\n4HFg8Qj3KUnKQcoI41igC9gIvBLr6sC5CeveCPw9Bx++qgNfi1PWycBF8bUduBM4CdifsB9JUs5S\nAmN5E9u/m3CyvL9Kg7rzgJXAXmA7sBU4E1jfxP4lSaMkJTBqOez3E8CHgfuBTwMvEO73yIZDN94g\nKEmlkRIYL9N3wvsI4PBYN22E+/wW8PlY/mvgGuDSQZZteKJ9x4aVB8rT2k+hrWP+CJsiSeNTrVaj\nVquN6jZTAmNqpnwY4dzFwib2uTtTvh64PZZ3AnMz8zpi3QBzz1rSxO4lafyrVqtUq9UD7zs7O5ve\n5nCfVrsfWA2c08Q+s1dcvZ++K6jWABcTRjHHAScSTrSrSJVJVCqVlk9t048u+i+X1E/KCOOCTPkw\nwn0Z/5e4/ZXAO4FjgB3AVUAVOI1wuOlJwjOrIFyJtSq+7gMuw3s/ilfvYeHlq1u+2/XXnd/yfUoa\nWkpgvI++L+59hCuYzkvcfqNjRzcMsfzVcZIklUxKYHwk70ZIksov5RzGXOA24Jk43Uo4IS1JmkBS\nAuNGwgnpOXG6PdZJkiaQlMA4lhAQe+P0XeC1ObZJklRCKYHxLPAhYBLh
nMcHgV/k2ShJUvmkBMaf\nAhcCTwNPAR+IdZKkCSTlKqnPE5779Hx8PwP4W+CSvBolSSqflBHGm+kLC4DngDPyaY4kqaxSAqNC\nGFX0mkE4nyFJmkBSDkldA9xLeGxHhXAO4wt5NkqSVD4pgXETsAl4N+ERIe8nPO9JkjSBpAQGwKNx\nkiRNUMN9vLkkaYIyMCRJSQwMSVISA0OSlMTAkCQlMTAkSUkMDElSEgNDkpTEwJAkJTEwJElJDAxJ\nUhIDQ5KUxMCQJCUxMCRJSQwMSVISA0OSlCTvwLgB2AU8nKmbAawDfgqsBaZn5i0DngAeBxbn3DZJ\n0jDkHRg3Auf0q7uSEBgnAXfF9wAnAxfF13OAb7agfZKkRHl/Id8NPN+v7lxgRSyvAM6P5fOAlcBe\nYDuwFTgz5/ZJkhIV8S/4mYTDVMTXmbE8B+jOLNcNtLewXZKkIUwueP/1OA01f4AdG1YeKE9rP4W2\njvmj3CxJGttqtRq1Wm1Ut1lEYOwCZgFPA7OB3bF+JzA3s1xHrBtg7llL8myfJI151WqVarV64H1n\nZ2fT2yzikNQaYGksLwVWZ+ovBo4AjgNOBDa2vHWSpIbyHmGsBN4JHAPsAP4K+BKwCriUcHL7wrhs\nV6zvAvYBlzH04SpJUgvlHRiDHTtaNEj91XGSJJWM9zlIkpIYGJKkJAaGJCmJgSFJSmJgSJKSGBiS\npCQGhiQpiYEhSUpiYEiSkhgYkqQkBoYkKYmBIUlKYmBIkpIYGJKkJAaGJCmJgSFJSmJgSJKSGBiS\npCQGhiQpiYGhcqpMolKpNJwOWmyQZZqZ2qYfXdAfLZXb5KIbIDVU72Hh5asPuVjKMsO1/rrzR32b\n0njgCEOSlMTAkCQlMTAkSUkMDElSEgNDkpTEwJAkJTEwJElJirwPYzvwEtAD7AXOBGYAPwBeH+df\nCLxQTPMkSVlFjjDqQBU4nRAWAFcC64CTgLvie0lSCRR9SKrS7/25wIpYXgF4y60klUTRI4w7gfuB\nj8a6mcCuWN4V30uSSqDIcxhnA08BxxIOQz3eb349TpKkEigyMJ6Kr88AtxHOY+wCZgFPA7OB3Y1W\n3LFh5YHytPZTaOuYn2tDNcHEJ+W22rS26bz4wvMt36/Gp1qtRq1WG9VtFhUYU4BJwB7g1cBioBNY\nAywFvhxfGz6KdO5ZS1rTSk1MiU/KHW0+JVejqVqtUq1WD7zv7OxseptFBcZMwqiitw3fB9YSzmes\nAi6l77JaSVIJFBUYTwKnNah/DljU4rZIkhIUfVmtJGmMMDAkSUkMDElSEgNDkpTEwJAkJTEwJElJ\nDAxJUhIDQ5KUxMCQJCUp8uGDkrIKeugh+OBDpTEwpLIo6KGH4IMPlcZDUpKkJAaGJCmJgSFJSmJg\nSJKSGBiSpCQGhiQpiYEhSUpiYEiSknjjnqTC7jL3DvOxxcCQVNhd5t5hPrZ4SEqSlMTAkCQlMTAk\nSUkMDElSEgNDkpTEwJAkJTEwJElJyhgY5wCPA08Anym4LZLyFG8YLGJqm3500X/9mFO2G/cmAdcB\ni4CdwH3AGuCxIhs1nr3Y/TBtHfOLbsa4YF+OwBA3DObdn+v/4QLvbh+msgXGmcBWYHt8/8/AeRgY\nuXlp5yN+yY0S+3J05d6f3t0+bGU7JNUO7Mi87451kqSClW2EUU9ZqPuuL+fdjgH2/vqVlu9Tksqk\n9QfwhrYQWE448Q2wDNgPZBNiK3B8a5slSWPeNuCEohsxmiYT/qh5wBHAZuCNRTZIklRe7wH+hzCS\nWFZwWyRJkiSNVSk37F0b5z8EnD7MdSeaZvpzO7AFeBDYmF8Tx5RD9edvAvcCvwI+Pcx1J6Jm+nM7\nfj6zDtWXf0L4f3wL8N/AqcNYt5QmEQ5BzQMOp/G5i98H/i2WzwLWD2PdiaaZ/gR4EpiRbxPHlJT+\nPBZ4C/A3HPwF5+dzoGb6E/x8ZqX0
5W8DbbF8Dk18d5blPozsDXt76bthL+tcYEUsbwCmA7MS151o\nRtqfMzPzy3YFXZFS+vMZ4P44f7jrTjTN9GcvP59BSl/eC7wYyxuAjmGse5CyBEbKDXuDLTMnYd2J\nppn+hHA/zJ2E/2E/mlMbx5Jmbij1ZtSBmu0TP599htuXl9J3ZGHY/x3KcuNe0g17+K+KVM3259uA\nnxMOC6wjHOO8exTaNVal9udorzteNdsnZwNP4ecThteX7wIuIfTfcNcFyjPC2AnMzbyfS0i7oZbp\niMukrDvRjLQ/d8byz+PrM8BthKHrRNbMZ8zP50DN9slT8dXPZ3pfngr8E+FQdO+TD8fsZzPlhr3s\nSdqF9J248Wa/gZrpzynAa2L51YSrKhbn2NaxYDifseUcfJLWz+dAzfSnn8+DpfTl6wjnKhaOYN3S\nanTD3sfj1Ou6OP8h4IxDrDvRjbQ/30D44GwGHsH+7HWo/pxFOB78IuFfcD8Dpg6x7kQ30v708znQ\nofryeuBZwmXI/S9F9rMpSZIkSZIkSZIkSZIkSZIkSePDcvpu2OoEfieW3w48CjwAHAl8lXA9fut/\n6D3NAuAbRTdCksazqxj4WGuAfyQ837/XCwzvGWNleX6aJKkJnyPcdXo3cAvwF7H+u8AFhCdtPgv8\nL/A94F+BfYS7Vy8kPIzuh4Q7WTcCb43rLwduBu4Bvg8cM8RyNwA/ITwy4ROZtn2YcEf8ZuCmWDfY\n/rKqwO0J2896GfgKYeS0jvBIh/+M67wvLjOJMLraGNv1sVg/lfAk102EH8w5N9bPAx4Dvh23ewdh\nhAZwBWHU9hCwcpA2SVJpLCB8wR1JeFbQE/QFxo3AHzYoA+zJlG+h70mcrwO6Ynk5cB/wqoTl7iH8\nmMxvAL8gfDG/iRBkvT/aM/0Q28mqcnBgNNp+f/uB34vlfwHWxuVOJYQjhID4XCy/Kv598+Jyvc9a\nOobQj8R5e+n7tbUf0DdS2xnbBDCtQXskwOG5yuPthC/HX8VpzRDLDnYIahEHPzztNYQH1NXj9l5J\nWO5HhC/WZ4HdhGcavRtYBTwXl39hiO1MAX45SPsabX8mfU8H7vVrwggA4GFCf/QQRgbzYv1iYD7w\nR/H9NOAEwtNGv0joz/2E34t5bVzmSUIoQxiB9G5rCyH8VsdJasjAUFnUOTgIRvLbJxXCz83+usG8\nXyYul63rIfw/0r9tKdsZTKPt95f9lbn9mXX291v+csIhq6yPEEYWZ8TtP0nfoadXMsv1AEfF8h8A\n7yAc7vocIYh6DvmXaMIpy+9hSP8FnE/fIan3jmAbawnH43u9ucnlIITFfwAfoO+Q1NGDbOe0Q7Rv\nNH8A7A7gMvoC5CTC6GYaYeTSQ/jBnNcntOl1QA24kvDbz68exXZqHDEwVBYPEo6rP0T4nY6NQyxb\nH6R8BfCWuI1HOfhR7iNZrlcX8AXCiefNwDWDbOdjDdatZ7aZLQ+l/zKN/t7rY7seIBy2+hbh/MX3\nY5u2AB8inOgearuTCBcEbInb+gbwUkIbJUmSJEmSJEmSJEmSJEmSJEmSJEmS1Iz/BxMO3JjboFOC\nAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def VertLine(x):\n", + " \"\"\"Draws a vertical line at x.\"\"\"\n", + " pyplot.plot([x, x], [0, 300], linewidth=3, color='0.8')\n", + "\n", + "VertLine(actual)\n", + "pyplot.hist(test_stats, color=COLOR5)\n", + "pyplot.xlabel('difference in means')\n", + 
"pyplot.ylabel('count')\n", + "None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The p-value is the probability that the test statistic under the null hypothesis exceeds the actual value." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.14999999999999999" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pvalue = sum(test_stats >= actual) / len(test_stats)\n", + "pvalue" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case the result is about 15%, which means that even if there is no difference between the groups, it is plausible that we could see a sample difference as big as 0.078 weeks.\n", + "\n", + "We conclude that the apparent effect might be due to chance, so we are not confident that it would appear in the general population, or in another sample from the same population." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Part Two\n", + "========\n", + "\n", + "We can take the pieces from the previous section and organize them in a class that represents the structure of a hypothesis test." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "class HypothesisTest(object):\n", + " \"\"\"Represents a hypothesis test.\"\"\"\n", + "\n", + " def __init__(self, data):\n", + " \"\"\"Initializes.\n", + "\n", + " data: data in whatever form is relevant\n", + " \"\"\"\n", + " self.data = data\n", + " self.MakeModel()\n", + " self.actual = self.TestStatistic(data)\n", + " self.test_stats = None\n", + "\n", + " def PValue(self, iters=1000):\n", + " \"\"\"Computes the distribution of the test statistic and p-value.\n", + "\n", + " iters: number of iterations\n", + "\n", + " returns: float p-value\n", + " \"\"\"\n", + " self.test_stats = numpy.array([self.TestStatistic(self.RunModel()) \n", + " for _ in range(iters)])\n", + "\n", + " count = sum(self.test_stats >= self.actual)\n", + " return count / iters\n", + "\n", + " def MaxTestStat(self):\n", + " \"\"\"Returns the largest test statistic seen during simulations.\n", + " \"\"\"\n", + " return max(self.test_stats)\n", + "\n", + " def PlotHist(self, label=None):\n", + " \"\"\"Draws a Cdf with vertical lines at the observed test stat.\n", + " \"\"\"\n", + " def VertLine(x):\n", + " \"\"\"Draws a vertical line at x.\"\"\"\n", + " pyplot.plot([x, x], [0, max(ys)], linewidth=3, color='0.8')\n", + "\n", + " ys, xs, patches = pyplot.hist(ht.test_stats, color=COLOR4)\n", + " VertLine(self.actual)\n", + " pyplot.xlabel('test statistic')\n", + " pyplot.ylabel('count')\n", + "\n", + " def TestStatistic(self, data):\n", + " \"\"\"Computes the test statistic.\n", + "\n", + " data: data in whatever form is relevant \n", + " \"\"\"\n", + " raise UnimplementedMethodException()\n", + "\n", + " def MakeModel(self):\n", + " \"\"\"Build a model of the null hypothesis.\n", + " \"\"\"\n", + " pass\n", + "\n", + " def RunModel(self):\n", + " \"\"\"Run the model of the null hypothesis.\n", + "\n", + " returns: simulated data\n", + " \"\"\"\n", + " raise 
UnimplementedMethodException()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`HypothesisTest` is an abstract parent class that encodes the template. Child classes fill in the missing methods. For example, here's the test from the previous section." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "class DiffMeansPermute(HypothesisTest):\n", + " \"\"\"Tests a difference in means by permutation.\"\"\"\n", + "\n", + " def TestStatistic(self, data):\n", + " \"\"\"Computes the test statistic.\n", + "\n", + " data: data in whatever form is relevant \n", + " \"\"\"\n", + " group1, group2 = data\n", + " test_stat = abs(group1.mean() - group2.mean())\n", + " return test_stat\n", + "\n", + " def MakeModel(self):\n", + " \"\"\"Build a model of the null hypothesis.\n", + " \"\"\"\n", + " group1, group2 = self.data\n", + " self.n, self.m = len(group1), len(group2)\n", + " self.pool = numpy.hstack((group1, group2))\n", + "\n", + " def RunModel(self):\n", + " \"\"\"Run the model of the null hypothesis.\n", + "\n", + " returns: simulated data\n", + " \"\"\"\n", + " numpy.random.shuffle(self.pool)\n", + " data = self.pool[:self.n], self.pool[self.n:]\n", + " return data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can run the test by instantiating a DiffMeansPermute object:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "means permute pregnancy length\n", + "p-value = 0.16\n", + "actual = 0.0780372667775\n", + "ts max = 0.173695697482\n" + ] + } + ], + "source": [ + "data = (firsts.prglngth, others.prglngth)\n", + "ht = DiffMeansPermute(data)\n", + "p_value = ht.PValue(iters=1000)\n", + "print('\\nmeans permute pregnancy length')\n", + "print('p-value =', p_value)\n", + "print('actual 
=', ht.actual)\n", + "print('ts max =', ht.MaxTestStat())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can plot the sampling distribution of the test statistic under the null hypothesis." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEPCAYAAABRHfM8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEn5JREFUeJzt3X+MHOV9x/H3GtsVYIh94Yd/oZhfdnHVgoNLKSRilVLL\nVDWGRoGgENwGRW5JgDRVWlyp5a5NKVFDQhEiIuWXKeAEQWPZamkhiFWhARxc2xiMATu2iinYJDYE\nSJr4zPWP51nf2r49P3u3s8+c7/2SRjv77Mzs1+e5++w8z84MSJIkSZIkSZIkSZIkSZIkSSrQCcAT\nwIvAC8A1sb0b2AasidMFDessAV4FNgLzOlWoJCmvycAZcX4C8DJwGnA98OUBlp8NrAXGATOATcCY\nwquUJCUp8g/ym4QAAHgPeAmYFp9XBlh+IbAM2A1sJQTGWQXWJ0lqQac+wc8A5gDPxOdXA+uAO4GJ\nsW0qoauqbhv9ASNJyqwTgTEBeAi4lnCk8S3gREJ31RvATYOs21d4dZKkJGML3v444GHgPmB5bNvR\n8PodwMo4/zphoLxuemzbx8knn9y3efPm9lcqSYe2zcApw9lAkUcYFUKX0wbg5ob2KQ3zFwPr4/wK\n4NPAeMIRyKnAqv03unnzZvr6+ko1XX/99dlrsKZDqy5rsqZ2T8DJw/2jXuQRxrnA5cDzhK/PAvwl\ncBmhO6oP2AIsjq9tAB6Mj73AVdglJUmlUWRgPMXARzCPDLLODXGSJJWM5zm0QbVazV3CAawpXRnr\nsqY01tRZA50PUXZ9sT9OkpSoUqnAMP/me4QhSUpiYEiSkhgYkqQkBoYkKYmBIUlKYmBIkpIYGJKk\nJAaGJCmJgSFJSmJgSJKSFH0/jEK8//772d57/PjxjBs3Ltv7a3hWr169d/7MM8/MWIk08ozIwDju\nuGOyvO+ePR8wd+4cnnrqmYMvLEmHmBEZGO+//49Z3nfdute44oqHsry3JOXmGIYkKYmBIUlKYmBI\nkpIYGJKkJAaGJCmJgSFJSmJgSJKSGBiSpCQGhiQpiYEhSUpiYEiSkhgYkqQkBoYkKYmBIUlKYmCM\nQF1dk6hUKtmmrq5JuX8EkjIYkffDGO127Xqbvr7bs71/pbI423tLyscjDElSEgNDkpTEwJAkJXEM\no0UbN26iUqnkLkOSOs7AaNEvf9mbdcAZHHSWlEeRXVInAE8ALwIvANfE9i7gMeAV4FFgYsM6S4BX\ngY3AvAJrkyS1qMjA2A38KfBrwNnAF4DTgOsIgTETeDw+B5gNXBof5wO3FVyfJKkFRf5BfhNYG+ff\nA14CpgEXAktj+1Lgoji/EFhGCJqtwCbgrALrkyS1oFOf4GcAc4BngeOB7bF9e3wOMBXY1rDONkLA\nSJJKoBOD3hOAh4FrgXf3e60vTs0M+Fp398q989XqTKrVWcMsUZIOLbVajVqt1tZtFh0Y4whh8c/A\n8ti2HZhM6LKaAuyI7a8TBsrrpse2A3R3LyiiVkk6ZFSrVarV6t7nPT09w95mkV1SFeBOYANwc0P7\nCmBRnF9Ef5CsAD4NjAdOBE4FVhVYnySpBUUeYZwLXA48D6yJbUuAG4EHgSsJg9uX
xNc2xPYNQC9w\nFYN3V0mSOqjIwHiK5kcw5zdpvyFOkqSS8TwHSVISA0OSlMTAkCQlMTAkSUkMDElSEgNDkpTEwJAk\nJTEwJElJDAxJUhIDQ5KUxMCQJCUxMCRJSQwMSVISA0OSlMTAkCQlMTAkSUkMDElSEgNDkpTEwJAk\nJTEwJElJDAxJUhIDQ5KUxMCQJCUxMCRJSQwMSVISA0OSlMTAkCQlMTAkSUkMDElSEgNDkpTEwJAk\nJTEwJElJDAxJUhIDQ5KUxMCQJCUpOjDuArYD6xvauoFtwJo4XdDw2hLgVWAjMK/g2iRJLSg6MO4G\n5u/X1gd8A5gTp0di+2zg0vg4H7itA/VJkhIV/Qf5SWDXAO2VAdoWAsuA3cBWYBNwVmGVSZJakusT\n/NXAOuBOYGJsm0roqqrbBkzrcF2SpCZyBMa3gBOBM4A3gJsGWbavIxVJkg5qbIb33NEwfwewMs6/\nDpzQ8Nr02HaA7u6Ve+er1ZlUq7PaXKIkjWy1Wo1ardbWbeYIjCmEIwuAi+n/BtUK4AHCgPg04FRg\n1UAb6O5eUHCJkjSyVatVqtXq3uc9PT3D3mbRgbEMOA84BngNuB6oErqj+oAtwOK47AbgwfjYC1yF\nXVKSVBpFB8ZlA7TdNcjyN8RJklQynucgSUpiYEiSkhgYkqQkBoYkKYmBIUlKYmBIkpLkOHFPI9zY\nsWOoVAa6fmRnTJo0kZ07B7qmpaQipQTG48DvJLRplOjt/YC+vtuzvX+lsvjgC0lqu8EC43DgCOBY\noKuh/Wi8iqwkjTqDBcZi4FrCZcdXN7S/C9xaZFGSpPIZLDBujtM1wC2dKUeSVFYpYxi3AOcAM/Zb\n/t4iCpIklVNKYNwHnASsBfY0tBsYkjSKpATGmcBsvNS4JI1qKSfuvUC46ZEkaRRLOcI4lnBTo1XA\nL2JbH3BhUUVJksonJTC6iy5CklR+KYFRK7oISVL5pQTGe/QPeI8HxsW2o4sqSpJUPimBMaFhfgxh\n7OLsYsqRJJVVq5c3/wBYDswvoBZJUomlHGF8smF+DOG8jJ8XU44kqaxSAmMB/WMYvcBWYGFRBUmS\nyiklMP6w6CIkSeWXMoZxAvA94K04PQxML7IoSVL5pATG3cAKwn0xpgIrY5skaRRJCYxjCQGxO073\nAMcVWJMkqYRSAuMnwGeBwwhjHpcDPy6yKElS+aQExh8BlwBvAm8An4ptkqRRJOVbUn8DXAHsis+7\ngK8DnyuqKElS+aQcYZxOf1gA7AQ+Wkw5kqSySgmMCuGooq6LMJ4hSRpFUrqkbgKeBh4khMengL8r\nsihJUvmkBMa9wGrgE4RLhFxMuAOfJGkUSQkMgBfjJEkapVq9vLkkaZQqOjDuArYD6xvauoDHgFeA\nR4GJDa8tAV4FNgLzCq5NktSCogPjbg682dJ1hMCYCTwenwPMBi6Nj/OB2zpQnyQpUdF/kJ9k33M4\nINzidWmcXwpcFOcXAssI16vaCmwCziq4PklSohyf4I8ndFMRH4+P81OBbQ3LbQOmdbAuSdIgcnf5\n9NF/N79mr0uSSiD1a7XttB2YTLiY4RRgR2x/nXCzprrpse0A3d0r985XqzOpVmcVUqgkjVS1Wo1a\nrdbWbeYIjBXAIuBr8XF5Q/sDwDcIXVGnAqsG2kB394Liq5SkEaxarVKtVvc+7+npGfY2iw6MZcB5\nwDHAa8BfAzcSLjNyJWFw+5K47IbYvgHoBa7CLilJKo2iA+OyJu3nN2m/IU6SpJLJPegtSRohDAxJ\nUhIDQ5KUxMCQJCUxMDTijB07hkqlMqSp0VC30dU1KdO/XMorx3kY0rD09n5AX9/tQ1p39er++aFu\no1JZPKT1pJHOIwxJUhIDQ5KUxMCQJCUxMCRJSQwMSVISA0OSlMTAkCQlMTAkSUkMDElSEgNDkpTE\nwJAkJTEwJElJDAxJUhIDQ5KUxMCQJCUxMCRJ
SQwMSVISA0OSlMTAkCQlMTAkSUkMDElSEgNDkpTE\nwJAkJTEwJElJDAxJUhIDQ5KUxMCQJCUxMCRJSQwMSVISA0OSlGRsxvfeCvwU2APsBs4CuoDvAh+J\nr18CvJ2nPElSo5xHGH1AFZhDCAuA64DHgJnA4/G5JKkEcndJVfZ7fiGwNM4vBS7qbDmSpGZyH2F8\nH3gO+HxsOx7YHue3x+dSqYwdO4ZKpZJ16uqalPvHoFEo5xjGucAbwLGEbqiN+73eF6cDdHev3Dtf\nrc6kWp1VUInSgXp7P6Cv7/asNVQqi7O+v8qvVqtRq9Xaus2cgfFGfHwL+B5hHGM7MBl4E5gC7Bho\nxe7uBZ2oT5JGrGq1SrVa3fu8p6dn2NvM1SV1BHBUnD8SmAesB1YAi2L7ImB550uTJA0k1xHG8YSj\ninoN9wOPEsYzHgSupP9rtZKkEsgVGFuAMwZo3wmc3+FaJEkJcn+tVpI0QhgYkqQkBoYkKYmBIUlK\nYmBIkpIYGJKkJAaGJCmJgSFJSmJgSJKS5Lz4oKQhql9iPZdJkyayc+eubO+vPAwMaQTKfYl1L68+\nOtklJUlKYmBIkpIYGJKkJAaGJCmJgSFJSmJgSJKSGBiSpCQGhiQpiYEhSUrimd6SWualSUYnA0NS\ny7w0yehkl5QkKYmBIUlKYmBIkpIYGJKkJAaGJCmJgSFJSmJgSJKSGBiSpCQGhiQpiYEhSUpiYEiS\nkhgYkqQkXnxQ0oiT+2q5MDqvmFvGwJgP3AwcBtwBfC1vOZLKJvfVcgHGjfuTUXeJ97IFxmHArcD5\nwOvAD4EVwEs5izqYWu1lqtVZucvYhzWlK2Nd1pQmZ03NQqtTNeW4xHvZxjDOAjYBW4HdwHeAhTkL\nSlGrvZK7hANYU7oy1mVNaayps8oWGNOA1xqeb4ttkqTMytYl1Zey0IIF3y66jgG9887PsryvJJVB\n3q8ZHOhsoJsw8A2wBPiAfQe+NwEnd7YsSRrxNgOn5C6incYS/lEzgPHAWuC0nAVJksrrAuBlwpHE\nksy1SJIkSRqp5gMbgVeBv2iyzC3x9XXAnBbX7XRdJwBPAC8CLwDXlKCmusOANcDKktQ0EXiIcL7N\nBsJYVu6alhD+79YDDwC/0qGafhV4Gvg/4M9aXDdHXTn388F+VpBnPx+splz7+WA1FbWfF+owQhfU\nDGAcA49d/B7wb3H+t4BnWlg3R12TgTPi/ARCN1s76hpOTXVfBu4nnBTZDsOtaSnwuTg/FvhQ5ppm\nAD+i/5fnu8CiDtV0LDAX+Cr7/nLn3s+b1ZVzP29WU12O/XywmnLt581qmkGL+3lZzsNIOWHvQsIP\nHOBZQlpPTly303UdD7xJ+M8DeI/wqWJq5poAphP+UN5B+74lN5yaPgR8HLgrvtYLvJO5pp/GdY4g\n/GIfQbjyQCdqegt4Lr7e6ro56sq5nzerCfLt581qyrmfN6up5f28LIGRcsJes2WmJqzb6bqm77fM\nDEJ3x7MZa6ov803gK4SvK7fLcH5OJxJ26LuB/wb+ibDj5qppGrATuAn4H+B/gbeB73eopiLW7dS2\nZ9DZ/XwwufbzZnLu5820vJ+XJTCSTtij8+eNDLWuxvUmEPotryV8AstVUwX4fWAHoV+3nT/L4fyc\nxgIfBW6Lj+8D12WsCcJ5Pl8i/AGcSvg//EwHa2r3up3Ydq79fCC59/OB5N7PB9Lyfl6WwHidMHhW\ndwIhKQdbZnpcJmXdTtdVP6wbBzwM3AcsL0FN5xC6YbYAy4BPAPdmrmlbnH4Y2x8i/ELlrGku8APg\nJ4Sug38h/Ow6UVMR6xa97Vz7eTM59/Nmcu7nzRS1nxcu5YS9xgHKs+kfoCzyZL/h1FUh7KTfbFMt\n7aip0Xm079sjw63pP4GZcb6b9lzSfjg1nUH4xs/hhP/HpcAXOlRTXTf7DlDm3s+b1ZVzP29WU6NO\n7+eD1ZRr
P29W0+kUs593xEAn7C2OU92t8fV17JvORZ7sN9S6PkboP11LODReQ/8lT3LV1Og82vft\nkeHWdDrhk9c6wqecdnx7ZLg1/Tn9XzdcSvgU3YmaJhP6pN8BdhH6lycMsm67DLWunPv5YD+ruk7v\n54PVlGs/H6ymovZzSZIkSZIkSZIkSZIkSZIk6VB1WO4CpAJ8iHBV0OeGuP6XCN+V701cfiHhqgk/\nbnG5HsLv4JY2LS9JatEMwolIQ7UF+HALy98DfLKNyw11eUlSi74D/Ixw1nH98gtfAVYRjhy6Y9uR\nwL8SzlJeD1wCXA38AngeeHyAbd9IODN2HfAPwG8TrsXzI8JVSE8CPh/fay3hmkGHE67Rs/9y99Af\nCCnbbVz+N4H/iu/xLAee4SxJSvAR9j3CmAfcHufHEK4t9HHgD4BvNyx3VHzcAnQNsN0PE+5sVnd0\nfLw7bquucd2/Bb7YZLn689Tt1p+PJ1w/6MzYPgG7l9UBZblardRO+1/Sel6c1gCrgVnAKYRQ+V3C\np/uPAe8eZLtvE25zeSdwMfDzJu/568CThKOUzwCzB6mtle3Wn88C3oj/FgiXE99zkNqlYTMwNFr8\nPeHmPnMIVwy9m3AP5DmE4Pgq8FcH2cYewh3OHiLcc+HfG15rvC/BPcBVwG8QBqoPb7IchABI3e5g\nbVLhDAwdit6lv3sJ4D8I35o6Mj6fRrjP8RTCJ/v7ga8TwqO+/tEc6EjCbVwfIdwv+vQmy08g3Lp0\nHHA5/X/gh7td4rZejrXPjW1HYZeUJA3Z/YQjh/qg9zWELqLnCYPFJxG6qdYRuqpW0X958y8SxhT2\nH/SeTBhgXhe389nYfg5hwHp13O4fEwarnwVuof8+zvsvVx+TSN1u45jGXOBpwqD3D+gPQ0mSJEmS\nJEmSJEmSJEmSJEmSJEmSJI0E/w/hy3nGuRWPsgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ht.PlotHist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an exercise, write a class named `DiffStdPermute` that extends `DiffMeansPermute` and overrides `TestStatistic` to compute the difference in standard deviations. Is the difference in standard deviations statistically significant?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "std permute pregnancy length\n", + "p-value = 0.155\n", + "actual = 0.176049064229\n", + "ts max = 0.44299505029\n" + ] + } + ], + "source": [ + "class DiffStdPermute(DiffMeansPermute):\n", + " \"\"\"Tests a difference in means by permutation.\"\"\"\n", + "\n", + " def TestStatistic(self, data):\n", + " \"\"\"Computes the test statistic.\n", + "\n", + " data: data in whatever form is relevant \n", + " \"\"\"\n", + " group1, group2 = data\n", + " test_stat = abs(group1.std() - group2.std())\n", + " return test_stat\n", + "\n", + "data = (firsts.prglngth, others.prglngth)\n", + "ht = DiffStdPermute(data)\n", + "p_value = ht.PValue(iters=1000)\n", + "print('\\nstd permute pregnancy length')\n", + "print('p-value =', p_value)\n", + "print('actual =', ht.actual)\n", + "print('ts max =', ht.MaxTestStat())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's run DiffMeansPermute again to see if there is a difference in birth weight between first babies and others." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "means permute birthweight\n", + "p-value = 0.0\n", + "actual = 0.124761184535\n", + "ts max = 0.0917504268392\n" + ] + } + ], + "source": [ + "data = (firsts.totalwgt_lb.dropna(), others.totalwgt_lb.dropna())\n", + "ht = DiffMeansPermute(data)\n", + "p_value = ht.PValue(iters=1000)\n", + "print('\\nmeans permute birthweight')\n", + "print('p-value =', p_value)\n", + "print('actual =', ht.actual)\n", + "print('ts max =', ht.MaxTestStat())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, after 1000 attempts, we never see a sample difference as big as the observed difference, so we conclude that the apparent effect is unlikely under the null hypothesis. Under normal circumstances, we can also make the inference that the apparent effect is unlikely to be caused by random sampling.\n", + "\n", + "One final note: in this case I would report that the p-value is less than 1/1000 or 0.001. I would not report that p=0, because the apparent effect is not impossible under the null hypothesis; just unlikely." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/scipy/nsfg.py b/scipy/nsfg.py new file mode 100644 index 0000000..86a2043 --- /dev/null +++ b/scipy/nsfg.py @@ -0,0 +1,106 @@ +"""This file contains code for use with "Think Stats", +by Allen B. Downey, available from greenteapress.com + +Copyright 2010 Allen B. 
Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function + +from collections import defaultdict +import numpy as np +import sys + +import thinkstats2 + + +def ReadFemPreg(dct_file='2002FemPreg.dct', + dat_file='2002FemPreg.dat.gz'): + """Reads the NSFG pregnancy data. + + dct_file: string file name + dat_file: string file name + + returns: DataFrame + """ + dct = thinkstats2.ReadStataDct(dct_file) + df = dct.ReadFixedWidth(dat_file, compression='gzip') + CleanFemPreg(df) + return df + + +def CleanFemPreg(df): + """Recodes variables from the pregnancy frame. + + df: DataFrame + """ + # mother's age is encoded in centiyears; convert to years + df.agepreg /= 100.0 + + # birthwgt_lb contains at least one bogus value (51 lbs) + # replace with NaN + df.birthwgt_lb[df.birthwgt_lb > 20] = np.nan + + # replace 'not ascertained', 'refused', 'don't know' with NaN + na_vals = [97, 98, 99] + df.birthwgt_lb.replace(na_vals, np.nan, inplace=True) + df.birthwgt_oz.replace(na_vals, np.nan, inplace=True) + df.hpagelb.replace(na_vals, np.nan, inplace=True) + + df.babysex.replace([7, 9], np.nan, inplace=True) + df.nbrnaliv.replace([9], np.nan, inplace=True) + + # birthweight is stored in two columns, lbs and oz. + # convert to a single column in lb + # NOTE: creating a new column requires dictionary syntax, + # not attribute assignment (like df.totalwgt_lb) + df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0 + + # due to a bug in ReadStataDct, the last variable gets clipped; + # so for now set it to NaN + df.cmintvw = np.nan + + +def MakePregMap(df): + """Make a map from caseid to list of preg indices. + + df: DataFrame + + returns: dict that maps from caseid to list of indices into preg df + """ + d = defaultdict(list) + for index, caseid in df.caseid.iteritems(): + d[caseid].append(index) + return d + + +def main(script): + """Tests the functions in this module. 
+ + script: string script name + """ + df = ReadFemPreg() + print(df.shape) + + assert len(df) == 13593 + + assert df.caseid[13592] == 12571 + assert df.pregordr.value_counts()[1] == 5033 + assert df.nbrnaliv.value_counts()[1] == 8981 + assert df.babysex.value_counts()[1] == 4641 + assert df.birthwgt_lb.value_counts()[7] == 3049 + assert df.birthwgt_oz.value_counts()[0] == 1037 + assert df.prglngth.value_counts()[39] == 4744 + assert df.outcome.value_counts()[1] == 9148 + assert df.birthord.value_counts()[1] == 4413 + assert df.agepreg.value_counts()[22.75] == 100 + assert df.totalwgt_lb.value_counts()[7.5] == 302 + + weights = df.finalwgt.value_counts() + key = max(weights.keys()) + assert df.finalwgt.value_counts()[key] == 6 + + print('%s: All tests passed.' % script) + +if __name__ == '__main__': + main(*sys.argv) diff --git a/scipy/thinkplot.py b/scipy/thinkplot.py new file mode 100644 index 0000000..3dee90a --- /dev/null +++ b/scipy/thinkplot.py @@ -0,0 +1,716 @@ +"""This file contains code for use with "Think Stats", +by Allen B. Downey, available from greenteapress.com + +Copyright 2014 Allen B. Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function + +import math +import matplotlib +import matplotlib.pyplot as pyplot +import numpy as np +import pandas + +import warnings + +# customize some matplotlib attributes +#matplotlib.rc('figure', figsize=(4, 3)) + +#matplotlib.rc('font', size=14.0) +#matplotlib.rc('axes', labelsize=22.0, titlesize=22.0) +#matplotlib.rc('legend', fontsize=20.0) + +#matplotlib.rc('xtick.major', size=6.0) +#matplotlib.rc('xtick.minor', size=3.0) + +#matplotlib.rc('ytick.major', size=6.0) +#matplotlib.rc('ytick.minor', size=3.0) + + +class _Brewer(object): + """Encapsulates a nice sequence of colors. + + Shades of blue that look good in color and can be distinguished + in grayscale (up to a point). 
+ + Borrowed from http://colorbrewer2.org/ + """ + color_iter = None + + colors = ['#081D58', + '#253494', + '#225EA8', + '#1D91C0', + '#41B6C4', + '#7FCDBB', + '#C7E9B4', + '#EDF8B1', + '#FFFFD9'] + + # lists that indicate which colors to use depending on how many are used + which_colors = [[], + [1], + [1, 3], + [0, 2, 4], + [0, 2, 4, 6], + [0, 2, 3, 5, 6], + [0, 2, 3, 4, 5, 6], + [0, 1, 2, 3, 4, 5, 6], + ] + + @classmethod + def Colors(cls): + """Returns the list of colors. + """ + return cls.colors + + @classmethod + def ColorGenerator(cls, n): + """Returns an iterator of color strings. + + n: how many colors will be used + """ + for i in cls.which_colors[n]: + yield cls.colors[i] + raise StopIteration('Ran out of colors in _Brewer.ColorGenerator') + + @classmethod + def InitializeIter(cls, num): + """Initializes the color iterator with the given number of colors.""" + cls.color_iter = cls.ColorGenerator(num) + + @classmethod + def ClearIter(cls): + """Sets the color iterator to None.""" + cls.color_iter = None + + @classmethod + def GetIter(cls): + """Gets the color iterator.""" + if cls.color_iter is None: + cls.InitializeIter(7) + + return cls.color_iter + + +def PrePlot(num=None, rows=None, cols=None): + """Takes hints about what's coming. 
+ + num: number of lines that will be plotted + rows: number of rows of subplots + cols: number of columns of subplots + """ + if num: + _Brewer.InitializeIter(num) + + if rows is None and cols is None: + return + + if rows is not None and cols is None: + cols = 1 + + if cols is not None and rows is None: + rows = 1 + + # resize the image, depending on the number of rows and cols + size_map = {(1, 1): (8, 6), + (1, 2): (14, 6), + (1, 3): (14, 6), + (2, 2): (10, 10), + (2, 3): (16, 10), + (3, 1): (8, 10), + } + + if (rows, cols) in size_map: + fig = pyplot.gcf() + fig.set_size_inches(*size_map[rows, cols]) + + # create the first subplot + if rows > 1 or cols > 1: + pyplot.subplot(rows, cols, 1) + global SUBPLOT_ROWS, SUBPLOT_COLS + SUBPLOT_ROWS = rows + SUBPLOT_COLS = cols + + +def SubPlot(plot_number, rows=None, cols=None): + """Configures the number of subplots and changes the current plot. + + rows: int + cols: int + plot_number: int + """ + rows = rows or SUBPLOT_ROWS + cols = cols or SUBPLOT_COLS + pyplot.subplot(rows, cols, plot_number) + + +def _Underride(d, **options): + """Add key-value pairs to d only if key is not in d. + + If d is None, create a new dictionary. 
+ + d: dictionary + options: keyword args to add to d + """ + if d is None: + d = {} + + for key, val in options.items(): + d.setdefault(key, val) + + return d + + +def Clf(): + """Clears the figure and any hints that have been set.""" + global LOC + LOC = None + _Brewer.ClearIter() + pyplot.clf() + fig = pyplot.gcf() + fig.set_size_inches(8, 6) + + +def Figure(**options): + """Sets options for the current figure.""" + _Underride(options, figsize=(6, 8)) + pyplot.figure(**options) + + +def _UnderrideColor(options): + if 'color' in options: + return options + + color_iter = _Brewer.GetIter() + + if color_iter: + try: + options['color'] = next(color_iter) + except StopIteration: + # TODO: reconsider whether this should warn + # warnings.warn('Warning: Brewer ran out of colors.') + _Brewer.ClearIter() + return options + + +def Plot(obj, ys=None, style='', **options): + """Plots a line. + + Args: + obj: sequence of x values, or Series, or anything with Render() + ys: sequence of y values + style: style string passed along to pyplot.plot + options: keyword args passed to pyplot.plot + """ + options = _UnderrideColor(options) + label = getattr(obj, 'label', '_nolegend_') + options = _Underride(options, linewidth=3, alpha=0.8, label=label) + + xs = obj + if ys is None: + if hasattr(obj, 'Render'): + xs, ys = obj.Render() + if isinstance(obj, pandas.Series): + ys = obj.values + xs = obj.index + + if ys is None: + pyplot.plot(xs, style, **options) + else: + pyplot.plot(xs, ys, style, **options) + + +def FillBetween(xs, y1, y2=None, where=None, **options): + """Plots a line. + + Args: + xs: sequence of x values + y1: sequence of y values + y2: sequence of y values + where: sequence of boolean + options: keyword args passed to pyplot.fill_between + """ + options = _UnderrideColor(options) + options = _Underride(options, linewidth=0, alpha=0.5) + pyplot.fill_between(xs, y1, y2, where, **options) + + +def Bar(xs, ys, **options): + """Plots a line. 
+ + Args: + xs: sequence of x values + ys: sequence of y values + options: keyword args passed to pyplot.bar + """ + options = _UnderrideColor(options) + options = _Underride(options, linewidth=0, alpha=0.6) + pyplot.bar(xs, ys, **options) + + +def Scatter(xs, ys=None, **options): + """Makes a scatter plot. + + xs: x values + ys: y values + options: options passed to pyplot.scatter + """ + options = _Underride(options, color='blue', alpha=0.2, + s=30, edgecolors='none') + + if ys is None and isinstance(xs, pandas.Series): + ys = xs.values + xs = xs.index + + pyplot.scatter(xs, ys, **options) + + +def HexBin(xs, ys, **options): + """Makes a scatter plot. + + xs: x values + ys: y values + options: options passed to pyplot.scatter + """ + options = _Underride(options, cmap=matplotlib.cm.Blues) + pyplot.hexbin(xs, ys, **options) + + +def Pdf(pdf, **options): + """Plots a Pdf, Pmf, or Hist as a line. + + Args: + pdf: Pdf, Pmf, or Hist object + options: keyword args passed to pyplot.plot + """ + low, high = options.pop('low', None), options.pop('high', None) + n = options.pop('n', 101) + xs, ps = pdf.Render(low=low, high=high, n=n) + options = _Underride(options, label=pdf.label) + Plot(xs, ps, **options) + + +def Pdfs(pdfs, **options): + """Plots a sequence of PDFs. + + Options are passed along for all PDFs. If you want different + options for each pdf, make multiple calls to Pdf. + + Args: + pdfs: sequence of PDF objects + options: keyword args passed to pyplot.plot + """ + for pdf in pdfs: + Pdf(pdf, **options) + + +def Hist(hist, **options): + """Plots a Pmf or Hist with a bar plot. + + The default width of the bars is based on the minimum difference + between values in the Hist. If that's too small, you can override + it by providing a width keyword argument, in the same units + as the values. 
+ + Args: + hist: Hist or Pmf object + options: keyword args passed to pyplot.bar + """ + # find the minimum distance between adjacent values + xs, ys = hist.Render() + + if 'width' not in options: + try: + options['width'] = 0.9 * np.diff(xs).min() + except TypeError: + warnings.warn("Hist: Can't compute bar width automatically." + "Check for non-numeric types in Hist." + "Or try providing width option." + ) + + options = _Underride(options, label=hist.label) + options = _Underride(options, align='center') + if options['align'] == 'left': + options['align'] = 'edge' + elif options['align'] == 'right': + options['align'] = 'edge' + options['width'] *= -1 + + Bar(xs, ys, **options) + + +def Hists(hists, **options): + """Plots two histograms as interleaved bar plots. + + Options are passed along for all PMFs. If you want different + options for each pmf, make multiple calls to Pmf. + + Args: + hists: list of two Hist or Pmf objects + options: keyword args passed to pyplot.plot + """ + for hist in hists: + Hist(hist, **options) + + +def Pmf(pmf, **options): + """Plots a Pmf or Hist as a line. + + Args: + pmf: Hist or Pmf object + options: keyword args passed to pyplot.plot + """ + xs, ys = pmf.Render() + low, high = min(xs), max(xs) + + width = options.pop('width', None) + if width is None: + try: + width = np.diff(xs).min() + except TypeError: + warnings.warn("Pmf: Can't compute bar width automatically." + "Check for non-numeric types in Pmf." 
+ "Or try providing width option.") + points = [] + + lastx = np.nan + lasty = 0 + for x, y in zip(xs, ys): + if (x - lastx) > 1e-5: + points.append((lastx, 0)) + points.append((x, 0)) + + points.append((x, lasty)) + points.append((x, y)) + points.append((x+width, y)) + + lastx = x + width + lasty = y + points.append((lastx, 0)) + pxs, pys = zip(*points) + + align = options.pop('align', 'center') + if align == 'center': + pxs = np.array(pxs) - width/2.0 + if align == 'right': + pxs = np.array(pxs) - width + + options = _Underride(options, label=pmf.label) + Plot(pxs, pys, **options) + + +def Pmfs(pmfs, **options): + """Plots a sequence of PMFs. + + Options are passed along for all PMFs. If you want different + options for each pmf, make multiple calls to Pmf. + + Args: + pmfs: sequence of PMF objects + options: keyword args passed to pyplot.plot + """ + for pmf in pmfs: + Pmf(pmf, **options) + + +def Diff(t): + """Compute the differences between adjacent elements in a sequence. + + Args: + t: sequence of number + + Returns: + sequence of differences (length one less than t) + """ + diffs = [t[i+1] - t[i] for i in range(len(t)-1)] + return diffs + + +def Cdf(cdf, complement=False, transform=None, **options): + """Plots a CDF as a line. + + Args: + cdf: Cdf object + complement: boolean, whether to plot the complementary CDF + transform: string, one of 'exponential', 'pareto', 'weibull', 'gumbel' + options: keyword args passed to pyplot.plot + + Returns: + dictionary with the scale options that should be passed to + Config, Show or Save. 
+ """ + xs, ps = cdf.Render() + xs = np.asarray(xs) + ps = np.asarray(ps) + + scale = dict(xscale='linear', yscale='linear') + + for s in ['xscale', 'yscale']: + if s in options: + scale[s] = options.pop(s) + + if transform == 'exponential': + complement = True + scale['yscale'] = 'log' + + if transform == 'pareto': + complement = True + scale['yscale'] = 'log' + scale['xscale'] = 'log' + + if complement: + ps = [1.0-p for p in ps] + + if transform == 'weibull': + xs = np.delete(xs, -1) + ps = np.delete(ps, -1) + ps = [-math.log(1.0-p) for p in ps] + scale['xscale'] = 'log' + scale['yscale'] = 'log' + + if transform == 'gumbel': + xs = xp.delete(xs, 0) + ps = np.delete(ps, 0) + ps = [-math.log(p) for p in ps] + scale['yscale'] = 'log' + + options = _Underride(options, label=cdf.label) + Plot(xs, ps, **options) + return scale + + +def Cdfs(cdfs, complement=False, transform=None, **options): + """Plots a sequence of CDFs. + + cdfs: sequence of CDF objects + complement: boolean, whether to plot the complementary CDF + transform: string, one of 'exponential', 'pareto', 'weibull', 'gumbel' + options: keyword args passed to pyplot.plot + """ + for cdf in cdfs: + Cdf(cdf, complement, transform, **options) + + +def Contour(obj, pcolor=False, contour=True, imshow=False, **options): + """Makes a contour plot. 
+ + d: map from (x, y) to z, or object that provides GetDict + pcolor: boolean, whether to make a pseudocolor plot + contour: boolean, whether to make a contour plot + imshow: boolean, whether to use pyplot.imshow + options: keyword args passed to pyplot.pcolor and/or pyplot.contour + """ + try: + d = obj.GetDict() + except AttributeError: + d = obj + + _Underride(options, linewidth=3, cmap=matplotlib.cm.Blues) + + xs, ys = zip(*d.keys()) + xs = sorted(set(xs)) + ys = sorted(set(ys)) + + X, Y = np.meshgrid(xs, ys) + func = lambda x, y: d.get((x, y), 0) + func = np.vectorize(func) + Z = func(X, Y) + + x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False) + axes = pyplot.gca() + axes.xaxis.set_major_formatter(x_formatter) + + if pcolor: + pyplot.pcolormesh(X, Y, Z, **options) + if contour: + cs = pyplot.contour(X, Y, Z, **options) + pyplot.clabel(cs, inline=1, fontsize=10) + if imshow: + extent = xs[0], xs[-1], ys[0], ys[-1] + pyplot.imshow(Z, extent=extent, **options) + + +def Pcolor(xs, ys, zs, pcolor=True, contour=False, **options): + """Makes a pseudocolor plot. + + xs: + ys: + zs: + pcolor: boolean, whether to make a pseudocolor plot + contour: boolean, whether to make a contour plot + options: keyword args passed to pyplot.pcolor and/or pyplot.contour + """ + _Underride(options, linewidth=3, cmap=matplotlib.cm.Blues) + + X, Y = np.meshgrid(xs, ys) + Z = zs + + x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False) + axes = pyplot.gca() + axes.xaxis.set_major_formatter(x_formatter) + + if pcolor: + pyplot.pcolormesh(X, Y, Z, **options) + + if contour: + cs = pyplot.contour(X, Y, Z, **options) + pyplot.clabel(cs, inline=1, fontsize=10) + + +def Text(x, y, s, **options): + """Puts text in a figure. 
+ + x: number + y: number + s: string + options: keyword args passed to pyplot.text + """ + options = _Underride(options, + fontsize=16, + verticalalignment='top', + horizontalalignment='left') + pyplot.text(x, y, s, **options) + + +LEGEND = True +LOC = None + +def Config(**options): + """Configures the plot. + + Pulls options out of the option dictionary and passes them to + the corresponding pyplot functions. + """ + names = ['title', 'xlabel', 'ylabel', 'xscale', 'yscale', + 'xticks', 'yticks', 'axis', 'xlim', 'ylim'] + + for name in names: + if name in options: + getattr(pyplot, name)(options[name]) + + # looks like this is not necessary: matplotlib understands text loc specs + loc_dict = {'upper right': 1, + 'upper left': 2, + 'lower left': 3, + 'lower right': 4, + 'right': 5, + 'center left': 6, + 'center right': 7, + 'lower center': 8, + 'upper center': 9, + 'center': 10, + } + + global LEGEND + LEGEND = options.get('legend', LEGEND) + + if LEGEND: + global LOC + LOC = options.get('loc', LOC) + pyplot.legend(loc=LOC) + + +def Show(**options): + """Shows the plot. + + For options, see Config. + + options: keyword args used to invoke various pyplot functions + """ + clf = options.pop('clf', True) + Config(**options) + pyplot.show() + if clf: + Clf() + + +def Plotly(**options): + """Shows the plot. + + For options, see Config. + + options: keyword args used to invoke various pyplot functions + """ + clf = options.pop('clf', True) + Config(**options) + import plotly.plotly as plotly + url = plotly.plot_mpl(pyplot.gcf()) + if clf: + Clf() + return url + + +def Save(root=None, formats=None, **options): + """Saves the plot in the given formats and clears the figure. + + For options, see Config. 
+ + Args: + root: string filename root + formats: list of string formats + options: keyword args used to invoke various pyplot functions + """ + clf = options.pop('clf', True) + Config(**options) + + if formats is None: + formats = ['pdf', 'eps'] + + try: + formats.remove('plotly') + Plotly(clf=False) + except ValueError: + pass + + if root: + for fmt in formats: + SaveFormat(root, fmt) + if clf: + Clf() + + +def SaveFormat(root, fmt='eps'): + """Writes the current figure to a file in the given format. + + Args: + root: string filename root + fmt: string format + """ + filename = '%s.%s' % (root, fmt) + print('Writing', filename) + pyplot.savefig(filename, format=fmt, dpi=300) + + +# provide aliases for calling functons with lower-case names +preplot = PrePlot +subplot = SubPlot +clf = Clf +figure = Figure +plot = Plot +text = Text +scatter = Scatter +pmf = Pmf +pmfs = Pmfs +hist = Hist +hists = Hists +diff = Diff +cdf = Cdf +cdfs = Cdfs +contour = Contour +pcolor = Pcolor +config = Config +show = Show +save = Save + + +def main(): + color_iter = _Brewer.ColorGenerator(7) + for color in color_iter: + print(color) + + +if __name__ == '__main__': + main() diff --git a/scipy/thinkstats2.py b/scipy/thinkstats2.py new file mode 100644 index 0000000..5141f5d --- /dev/null +++ b/scipy/thinkstats2.py @@ -0,0 +1,2801 @@ +"""This file contains code for use with "Think Stats" and +"Think Bayes", both by Allen B. Downey, available from greenteapress.com + +Copyright 2014 Allen B. Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function, division + +"""This file contains class definitions for: + +Hist: represents a histogram (map from values to integer frequencies). + +Pmf: represents a probability mass function (map from values to probs). + +_DictWrapper: private parent class for Hist and Pmf. 
+ +Cdf: represents a discrete cumulative distribution function + +Pdf: represents a continuous probability density function + +""" + +import bisect +import copy +import logging +import math +import random +import re + +from collections import Counter +from operator import itemgetter + +import thinkplot + +import numpy as np +import pandas + +import scipy +from scipy import stats +from scipy import special +from scipy import ndimage + +from io import open + +ROOT2 = math.sqrt(2) + +def RandomSeed(x): + """Initialize the random and np.random generators. + + x: int seed + """ + random.seed(x) + np.random.seed(x) + + +def Odds(p): + """Computes odds for a given probability. + + Example: p=0.75 means 75 for and 25 against, or 3:1 odds in favor. + + Note: when p=1, the formula for odds divides by zero, which is + normally undefined. But I think it is reasonable to define Odds(1) + to be infinity, so that's what this function does. + + p: float 0-1 + + Returns: float odds + """ + if p == 1: + return float('inf') + return p / (1 - p) + + +def Probability(o): + """Computes the probability corresponding to given odds. + + Example: o=2 means 2:1 odds in favor, or 2/3 probability + + o: float odds, strictly positive + + Returns: float probability + """ + return o / (o + 1) + + +def Probability2(yes, no): + """Computes the probability corresponding to given odds. + + Example: yes=2, no=1 means 2:1 odds in favor, or 2/3 probability. + + yes, no: int or float odds in favor + """ + return yes / (yes + no) + + +class Interpolator(object): + """Represents a mapping between sorted sequences; performs linear interp. 
+
+    Attributes:
+        xs: sorted list
+        ys: sorted list
+    """
+
+    def __init__(self, xs, ys):
+        self.xs = xs
+        self.ys = ys
+
+    def Lookup(self, x):
+        """Looks up x and returns the corresponding value of y."""
+        return self._Bisect(x, self.xs, self.ys)
+
+    def Reverse(self, y):
+        """Looks up y and returns the corresponding value of x."""
+        return self._Bisect(y, self.ys, self.xs)
+
+    def _Bisect(self, x, xs, ys):
+        """Helper function."""
+        if x <= xs[0]:
+            return ys[0]
+        if x >= xs[-1]:
+            return ys[-1]
+        i = bisect.bisect(xs, x)
+        frac = 1.0 * (x - xs[i - 1]) / (xs[i] - xs[i - 1])
+        y = ys[i - 1] + frac * 1.0 * (ys[i] - ys[i - 1])
+        return y
+
+
+class _DictWrapper(object):
+    """An object that contains a dictionary."""
+
+    def __init__(self, obj=None, label=None):
+        """Initializes the distribution.
+
+        obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
+        label: string label
+        """
+        self.label = label if label is not None else '_nolegend_'
+        self.d = {}
+
+        # flag whether the distribution is under a log transform
+        self.log = False
+
+        if obj is None:
+            return
+
+        if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
+            self.label = label if label is not None else obj.label
+
+        if isinstance(obj, dict):
+            self.d.update(obj.items())
+        elif isinstance(obj, (_DictWrapper, Cdf, Pdf)):
+            self.d.update(obj.Items())
+        elif isinstance(obj, pandas.Series):
+            self.d.update(obj.value_counts().items())
+        else:
+            # finally, treat it like a list
+            self.d.update(Counter(obj))
+
+        if len(self) > 0 and isinstance(self, Pmf):
+            self.Normalize()
+
+    def __hash__(self):
+        return id(self)
+
+    def __str__(self):
+        cls = self.__class__.__name__
+        return '%s(%s)' % (cls, str(self.d))
+
+    __repr__ = __str__
+
+    def __eq__(self, other):
+        return self.d == other.d
+
+    def __len__(self):
+        return len(self.d)
+
+    def __iter__(self):
+        return iter(self.d)
+
+    def iterkeys(self):
+        """Returns an iterator over keys."""
+        return iter(self.d)
+
+    def __contains__(self, value):
+        return value in self.d
+
+    def __getitem__(self, value):
+        return self.d.get(value, 0)
+
+    def __setitem__(self, value, prob):
+        self.d[value] = prob
+
+    def __delitem__(self, value):
+        del self.d[value]
+
+    def Copy(self, label=None):
+        """Returns a copy.
+
+        Make a shallow copy of d. If you want a deep copy of d,
+        use copy.deepcopy on the whole object.
+
+        label: string label for the new Hist
+
+        returns: new _DictWrapper with the same type
+        """
+        new = copy.copy(self)
+        new.d = copy.copy(self.d)
+        new.label = label if label is not None else self.label
+        return new
+
+    def Scale(self, factor):
+        """Multiplies the values by a factor.
+
+        factor: what to multiply by
+
+        Returns: new object
+        """
+        new = self.Copy()
+        new.d.clear()
+
+        for val, prob in self.Items():
+            new.Set(val * factor, prob)
+        return new
+
+    def Log(self, m=None):
+        """Log transforms the probabilities.
+
+        Removes values with probability 0.
+
+        Normalizes so that the largest logprob is 0.
+        """
+        if self.log:
+            raise ValueError("Pmf/Hist already under a log transform")
+        self.log = True
+
+        if m is None:
+            m = self.MaxLike()
+
+        for x, p in list(self.d.items()):
+            if p:
+                self.Set(x, math.log(p / m))
+            else:
+                self.Remove(x)
+
+    def Exp(self, m=None):
+        """Exponentiates the probabilities.
+
+        m: how much to shift the ps before exponentiating
+
+        If m is None, normalizes so that the largest prob is 1.
+        """
+        if not self.log:
+            raise ValueError("Pmf/Hist not under a log transform")
+        self.log = False
+
+        if m is None:
+            m = self.MaxLike()
+
+        for x, p in self.d.items():
+            self.Set(x, math.exp(p - m))
+
+    def GetDict(self):
+        """Gets the dictionary."""
+        return self.d
+
+    def SetDict(self, d):
+        """Sets the dictionary."""
+        self.d = d
+
+    def Values(self):
+        """Gets an unsorted sequence of values.
+
+        Note: one source of confusion is that the keys of this
+        dictionary are the values of the Hist/Pmf, and the
+        values of the dictionary are frequencies/probabilities.
+ """ + return self.d.keys() + + def Items(self): + """Gets an unsorted sequence of (value, freq/prob) pairs.""" + return self.d.items() + + def Render(self, **options): + """Generates a sequence of points suitable for plotting. + + Note: options are ignored + + Returns: + tuple of (sorted value sequence, freq/prob sequence) + """ + if min(self.d.keys()) is np.nan: + logging.warning('Hist: contains NaN, may not render correctly.') + + return zip(*sorted(self.Items())) + + def MakeCdf(self, label=None): + """Makes a Cdf.""" + label = label if label is not None else self.label + return Cdf(self, label=label) + + def Print(self): + """Prints the values and freqs/probs in ascending order.""" + for val, prob in sorted(self.d.items()): + print(val, prob) + + def Set(self, x, y=0): + """Sets the freq/prob associated with the value x. + + Args: + x: number value + y: number freq or prob + """ + self.d[x] = y + + def Incr(self, x, term=1): + """Increments the freq/prob associated with the value x. + + Args: + x: number value + term: how much to increment by + """ + self.d[x] = self.d.get(x, 0) + term + + def Mult(self, x, factor): + """Scales the freq/prob associated with the value x. + + Args: + x: number value + factor: how much to multiply by + """ + self.d[x] = self.d.get(x, 0) * factor + + def Remove(self, x): + """Removes a value. + + Throws an exception if the value is not there. + + Args: + x: value to remove + """ + del self.d[x] + + def Total(self): + """Returns the total of the frequencies/probabilities in the map.""" + total = sum(self.d.values()) + return total + + def MaxLike(self): + """Returns the largest frequency/probability in the map.""" + return max(self.d.values()) + + def Largest(self, n=10): + """Returns the largest n values, with frequency/probability. + + n: number of items to return + """ + return sorted(self.d.items(), reverse=True)[:n] + + def Smallest(self, n=10): + """Returns the smallest n values, with frequency/probability. 
+ + n: number of items to return + """ + return sorted(self.d.items(), reverse=False)[:n] + + +class Hist(_DictWrapper): + """Represents a histogram, which is a map from values to frequencies. + + Values can be any hashable type; frequencies are integer counters. + """ + def Freq(self, x): + """Gets the frequency associated with the value x. + + Args: + x: number value + + Returns: + int frequency + """ + return self.d.get(x, 0) + + def Freqs(self, xs): + """Gets frequencies for a sequence of values.""" + return [self.Freq(x) for x in xs] + + def IsSubset(self, other): + """Checks whether the values in this histogram are a subset of + the values in the given histogram.""" + for val, freq in self.Items(): + if freq > other.Freq(val): + return False + return True + + def Subtract(self, other): + """Subtracts the values in the given histogram from this histogram.""" + for val, freq in other.Items(): + self.Incr(val, -freq) + + +class Pmf(_DictWrapper): + """Represents a probability mass function. + + Values can be any hashable type; probabilities are floating-point. + Pmfs are not necessarily normalized. + """ + + def Prob(self, x, default=0): + """Gets the probability associated with the value x. + + Args: + x: number value + default: value to return if the key is not there + + Returns: + float probability + """ + return self.d.get(x, default) + + def Probs(self, xs): + """Gets probabilities for a sequence of values.""" + return [self.Prob(x) for x in xs] + + def Percentile(self, percentage): + """Computes a percentile of a given Pmf. + + Note: this is not super efficient. If you are planning + to compute more than a few percentiles, compute the Cdf. + + percentage: float 0-100 + + returns: value from the Pmf + """ + p = percentage / 100.0 + total = 0 + for val, prob in sorted(self.Items()): + total += prob + if total >= p: + return val + + def ProbGreater(self, x): + """Probability that a sample from this Pmf exceeds x. 
+ + x: number + + returns: float probability + """ + if isinstance(x, _DictWrapper): + return PmfProbGreater(self, x) + else: + t = [prob for (val, prob) in self.d.items() if val > x] + return sum(t) + + def ProbLess(self, x): + """Probability that a sample from this Pmf is less than x. + + x: number + + returns: float probability + """ + if isinstance(x, _DictWrapper): + return PmfProbLess(self, x) + else: + t = [prob for (val, prob) in self.d.items() if val < x] + return sum(t) + + def __lt__(self, obj): + """Less than. + + obj: number or _DictWrapper + + returns: float probability + """ + return self.ProbLess(obj) + + def __gt__(self, obj): + """Greater than. + + obj: number or _DictWrapper + + returns: float probability + """ + return self.ProbGreater(obj) + + def __ge__(self, obj): + """Greater than or equal. + + obj: number or _DictWrapper + + returns: float probability + """ + return 1 - (self < obj) + + def __le__(self, obj): + """Less than or equal. + + obj: number or _DictWrapper + + returns: float probability + """ + return 1 - (self > obj) + + def Normalize(self, fraction=1.0): + """Normalizes this PMF so the sum of all probs is fraction. + + Args: + fraction: what the total should be after normalization + + Returns: the total probability before normalizing + """ + if self.log: + raise ValueError("Normalize: Pmf is under a log transform") + + total = self.Total() + if total == 0.0: + raise ValueError('Normalize: total probability is zero.') + #logging.warning('Normalize: total probability is zero.') + #return total + + factor = fraction / total + for x in self.d: + self.d[x] *= factor + + return total + + def Random(self): + """Chooses a random element from this PMF. + + Note: this is not very efficient. If you plan to call + this more than a few times, consider converting to a CDF. 
+ + Returns: + float value from the Pmf + """ + target = random.random() + total = 0.0 + for x, p in self.d.items(): + total += p + if total >= target: + return x + + # we shouldn't get here + raise ValueError('Random: Pmf might not be normalized.') + + def Mean(self): + """Computes the mean of a PMF. + + Returns: + float mean + """ + mean = 0.0 + for x, p in self.d.items(): + mean += p * x + return mean + + def Var(self, mu=None): + """Computes the variance of a PMF. + + mu: the point around which the variance is computed; + if omitted, computes the mean + + returns: float variance + """ + if mu is None: + mu = self.Mean() + + var = 0.0 + for x, p in self.d.items(): + var += p * (x - mu) ** 2 + return var + + def Std(self, mu=None): + """Computes the standard deviation of a PMF. + + mu: the point around which the variance is computed; + if omitted, computes the mean + + returns: float standard deviation + """ + var = self.Var(mu) + return math.sqrt(var) + + def MaximumLikelihood(self): + """Returns the value with the highest probability. + + Returns: float probability + """ + _, val = max((prob, val) for val, prob in self.Items()) + return val + + def CredibleInterval(self, percentage=90): + """Computes the central credible interval. + + If percentage=90, computes the 90% CI. + + Args: + percentage: float between 0 and 100 + + Returns: + sequence of two floats, low and high + """ + cdf = self.MakeCdf() + return cdf.CredibleInterval(percentage) + + def __add__(self, other): + """Computes the Pmf of the sum of values drawn from self and other. + + other: another Pmf or a scalar + + returns: new Pmf + """ + try: + return self.AddPmf(other) + except AttributeError: + return self.AddConstant(other) + + def AddPmf(self, other): + """Computes the Pmf of the sum of values drawn from self and other. 
+ + other: another Pmf + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + for v2, p2 in other.Items(): + pmf.Incr(v1 + v2, p1 * p2) + return pmf + + def AddConstant(self, other): + """Computes the Pmf of the sum a constant and values from self. + + other: a number + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + pmf.Set(v1 + other, p1) + return pmf + + def __sub__(self, other): + """Computes the Pmf of the diff of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + try: + return self.SubPmf(other) + except AttributeError: + return self.AddConstant(-other) + + def SubPmf(self, other): + """Computes the Pmf of the diff of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + for v2, p2 in other.Items(): + pmf.Incr(v1 - v2, p1 * p2) + return pmf + + def __mul__(self, other): + """Computes the Pmf of the product of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + try: + return self.MulPmf(other) + except AttributeError: + return self.MulConstant(other) + + def MulPmf(self, other): + """Computes the Pmf of the diff of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + for v2, p2 in other.Items(): + pmf.Incr(v1 * v2, p1 * p2) + return pmf + + def MulConstant(self, other): + """Computes the Pmf of the product of a constant and values from self. + + other: a number + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + pmf.Set(v1 * other, p1) + return pmf + + def __div__(self, other): + """Computes the Pmf of the ratio of values drawn from self and other. 
+ + other: another Pmf + + returns: new Pmf + """ + try: + return self.DivPmf(other) + except AttributeError: + return self.MulConstant(1/other) + + __truediv__ = __div__ + + def DivPmf(self, other): + """Computes the Pmf of the ratio of values drawn from self and other. + + other: another Pmf + + returns: new Pmf + """ + pmf = Pmf() + for v1, p1 in self.Items(): + for v2, p2 in other.Items(): + pmf.Incr(v1 / v2, p1 * p2) + return pmf + + def Max(self, k): + """Computes the CDF of the maximum of k selections from this dist. + + k: int + + returns: new Cdf + """ + cdf = self.MakeCdf() + return cdf.Max(k) + + +class Joint(Pmf): + """Represents a joint distribution. + + The values are sequences (usually tuples) + """ + + def Marginal(self, i, label=None): + """Gets the marginal distribution of the indicated variable. + + i: index of the variable we want + + Returns: Pmf + """ + pmf = Pmf(label=label) + for vs, prob in self.Items(): + pmf.Incr(vs[i], prob) + return pmf + + def Conditional(self, i, j, val, label=None): + """Gets the conditional distribution of the indicated variable. + + Distribution of vs[i], conditioned on vs[j] = val. + + i: index of the variable we want + j: which variable is conditioned on + val: the value the jth variable has to have + + Returns: Pmf + """ + pmf = Pmf(label=label) + for vs, prob in self.Items(): + if vs[j] != val: + continue + pmf.Incr(vs[i], prob) + + pmf.Normalize() + return pmf + + def MaxLikeInterval(self, percentage=90): + """Returns the maximum-likelihood credible interval. + + If percentage=90, computes a 90% CI containing the values + with the highest likelihoods. 
+ + percentage: float between 0 and 100 + + Returns: list of values from the suite + """ + interval = [] + total = 0 + + t = [(prob, val) for val, prob in self.Items()] + t.sort(reverse=True) + + for prob, val in t: + interval.append(val) + total += prob + if total >= percentage / 100.0: + break + + return interval + + +def MakeJoint(pmf1, pmf2): + """Joint distribution of values from pmf1 and pmf2. + + Assumes that the PMFs represent independent random variables. + + Args: + pmf1: Pmf object + pmf2: Pmf object + + Returns: + Joint pmf of value pairs + """ + joint = Joint() + for v1, p1 in pmf1.Items(): + for v2, p2 in pmf2.Items(): + joint.Set((v1, v2), p1 * p2) + return joint + + +def MakeHistFromList(t, label=None): + """Makes a histogram from an unsorted sequence of values. + + Args: + t: sequence of numbers + label: string label for this histogram + + Returns: + Hist object + """ + return Hist(t, label=label) + + +def MakeHistFromDict(d, label=None): + """Makes a histogram from a map from values to frequencies. + + Args: + d: dictionary that maps values to frequencies + label: string label for this histogram + + Returns: + Hist object + """ + return Hist(d, label) + + +def MakePmfFromList(t, label=None): + """Makes a PMF from an unsorted sequence of values. + + Args: + t: sequence of numbers + label: string label for this PMF + + Returns: + Pmf object + """ + return Pmf(t, label=label) + + +def MakePmfFromDict(d, label=None): + """Makes a PMF from a map from values to probabilities. 
+ + Args: + d: dictionary that maps values to probabilities + label: string label for this PMF + + Returns: + Pmf object + """ + return Pmf(d, label=label) + + +def MakePmfFromItems(t, label=None): + """Makes a PMF from a sequence of value-probability pairs + + Args: + t: sequence of value-probability pairs + label: string label for this PMF + + Returns: + Pmf object + """ + return Pmf(dict(t), label=label) + + +def MakePmfFromHist(hist, label=None): + """Makes a normalized PMF from a Hist object. + + Args: + hist: Hist object + label: string label + + Returns: + Pmf object + """ + if label is None: + label = hist.label + + return Pmf(hist, label=label) + + +def MakeMixture(metapmf, label='mix'): + """Make a mixture distribution. + + Args: + metapmf: Pmf that maps from Pmfs to probs. + label: string label for the new Pmf. + + Returns: Pmf object. + """ + mix = Pmf(label=label) + for pmf, p1 in metapmf.Items(): + for x, p2 in pmf.Items(): + mix.Incr(x, p1 * p2) + return mix + + +def MakeUniformPmf(low, high, n): + """Make a uniform Pmf. + + low: lowest value (inclusive) + high: highest value (inclusize) + n: number of values + """ + pmf = Pmf() + for x in np.linspace(low, high, n): + pmf.Set(x, 1) + pmf.Normalize() + return pmf + + +class Cdf(object): + """Represents a cumulative distribution function. + + Attributes: + xs: sequence of values + ps: sequence of probabilities + label: string used as a graph label. + """ + def __init__(self, obj=None, ps=None, label=None): + """Initializes. + + If ps is provided, obj must be the corresponding list of values. 
+
+        obj: Hist, Pmf, Cdf, Pdf, dict, pandas Series, list of pairs
+        ps: list of cumulative probabilities
+        label: string label
+        """
+        self.label = label if label is not None else '_nolegend_'
+
+        if isinstance(obj, (_DictWrapper, Cdf, Pdf)):
+            if not label:
+                self.label = label if label is not None else obj.label
+
+        if obj is None:
+            # caller does not provide obj, make an empty Cdf
+            self.xs = np.asarray([])
+            self.ps = np.asarray([])
+            if ps is not None:
+                logging.warning("Cdf: can't pass ps without also passing xs.")
+            return
+        else:
+            # if the caller provides xs and ps, just store them
+            if ps is not None:
+                if isinstance(ps, str):
+                    logging.warning("Cdf: ps can't be a string")
+
+                self.xs = np.asarray(obj)
+                self.ps = np.asarray(ps)
+                return
+
+        # caller has provided just obj, not ps
+        if isinstance(obj, Cdf):
+            self.xs = copy.copy(obj.xs)
+            self.ps = copy.copy(obj.ps)
+            return
+
+        if isinstance(obj, _DictWrapper):
+            dw = obj
+        else:
+            dw = Hist(obj)
+
+        if len(dw) == 0:
+            self.xs = np.asarray([])
+            self.ps = np.asarray([])
+            return
+
+        xs, freqs = zip(*sorted(dw.Items()))
+        self.xs = np.asarray(xs)
+        self.ps = np.cumsum(freqs, dtype=float)
+        self.ps /= self.ps[-1]
+
+    def __str__(self):
+        return 'Cdf(%s, %s)' % (str(self.xs), str(self.ps))
+
+    __repr__ = __str__
+
+    def __len__(self):
+        return len(self.xs)
+
+    def __getitem__(self, x):
+        return self.Prob(x)
+
+    def __setitem__(self):
+        raise UnimplementedMethodException()
+
+    def __delitem__(self):
+        raise UnimplementedMethodException()
+
+    def __eq__(self, other):
+        return np.all(self.xs == other.xs) and np.all(self.ps == other.ps)
+
+    def Copy(self, label=None):
+        """Returns a copy of this Cdf.
+ + label: string label for the new Cdf + """ + if label is None: + label = self.label + return Cdf(list(self.xs), list(self.ps), label=label) + + def MakePmf(self, label=None): + """Makes a Pmf.""" + if label is None: + label = self.label + return Pmf(self, label=label) + + def Values(self): + """Returns a sorted list of values. + """ + return self.xs + + def Items(self): + """Returns a sorted sequence of (value, probability) pairs. + + Note: in Python3, returns an iterator. + """ + a = self.ps + b = np.roll(a, 1) + b[0] = 0 + return zip(self.xs, a-b) + + def Shift(self, term): + """Adds a term to the xs. + + term: how much to add + """ + new = self.Copy() + # don't use +=, or else an int array + float yields int array + new.xs = new.xs + term + return new + + def Scale(self, factor): + """Multiplies the xs by a factor. + + factor: what to multiply by + """ + new = self.Copy() + # don't use *=, or else an int array * float yields int array + new.xs = new.xs * factor + return new + + def Prob(self, x): + """Returns CDF(x), the probability that corresponds to value x. + + Args: + x: number + + Returns: + float probability + """ + if x < self.xs[0]: + return 0.0 + index = bisect.bisect(self.xs, x) + p = self.ps[index-1] + return p + + def Probs(self, xs): + """Gets probabilities for a sequence of values. + + xs: any sequence that can be converted to NumPy array + + returns: NumPy array of cumulative probabilities + """ + xs = np.asarray(xs) + index = np.searchsorted(self.xs, xs, side='right') + ps = self.ps[index-1] + ps[xs < self.xs[0]] = 0.0 + return ps + + ProbArray = Probs + + def Value(self, p): + """Returns InverseCDF(p), the value that corresponds to probability p. 
+ + Args: + p: number in the range [0, 1] + + Returns: + number value + """ + if p < 0 or p > 1: + raise ValueError('Probability p must be in range [0, 1]') + + index = bisect.bisect_left(self.ps, p) + return self.xs[index] + + def ValueArray(self, ps): + """Returns InverseCDF(p), the value that corresponds to probability p. + + Args: + ps: NumPy array of numbers in the range [0, 1] + + Returns: + NumPy array of values + """ + ps = np.asarray(ps) + if np.any(ps < 0) or np.any(ps > 1): + raise ValueError('Probability p must be in range [0, 1]') + + index = np.searchsorted(self.ps, ps, side='left') + return self.xs[index] + + def Percentile(self, p): + """Returns the value that corresponds to percentile p. + + Args: + p: number in the range [0, 100] + + Returns: + number value + """ + return self.Value(p / 100.0) + + def PercentileRank(self, x): + """Returns the percentile rank of the value x. + + x: potential value in the CDF + + returns: percentile rank in the range 0 to 100 + """ + return self.Prob(x) * 100.0 + + def Random(self): + """Chooses a random value from this distribution.""" + return self.Value(random.random()) + + def Sample(self, n): + """Generates a random sample from this distribution. + + n: int length of the sample + returns: NumPy array + """ + ps = np.random.random(n) + return self.ValueArray(ps) + + def Mean(self): + """Computes the mean of a CDF. + + Returns: + float mean + """ + old_p = 0 + total = 0.0 + for x, new_p in zip(self.xs, self.ps): + p = new_p - old_p + total += p * x + old_p = new_p + return total + + def CredibleInterval(self, percentage=90): + """Computes the central credible interval. + + If percentage=90, computes the 90% CI. 
+ + Args: + percentage: float between 0 and 100 + + Returns: + sequence of two floats, low and high + """ + prob = (1 - percentage / 100.0) / 2 + interval = self.Value(prob), self.Value(1 - prob) + return interval + + ConfidenceInterval = CredibleInterval + + def _Round(self, multiplier=1000.0): + """ + An entry is added to the cdf only if the percentile differs + from the previous value in a significant digit, where the number + of significant digits is determined by multiplier. The + default is 1000, which keeps log10(1000) = 3 significant digits. + """ + # TODO(write this method) + raise UnimplementedMethodException() + + def Render(self, **options): + """Generates a sequence of points suitable for plotting. + + An empirical CDF is a step function; linear interpolation + can be misleading. + + Note: options are ignored + + Returns: + tuple of (xs, ps) + """ + def interleave(a, b): + c = np.empty(a.shape[0] + b.shape[0]) + c[::2] = a + c[1::2] = b + return c + + a = np.array(self.xs) + xs = interleave(a, a) + shift_ps = np.roll(self.ps, 1) + shift_ps[0] = 0 + ps = interleave(shift_ps, self.ps) + return xs, ps + + def Max(self, k): + """Computes the CDF of the maximum of k selections from this dist. + + k: int + + returns: new Cdf + """ + cdf = self.Copy() + cdf.ps **= k + return cdf + + +def MakeCdfFromItems(items, label=None): + """Makes a cdf from an unsorted sequence of (value, frequency) pairs. + + Args: + items: unsorted sequence of (value, frequency) pairs + label: string label for this CDF + + Returns: + cdf: list of (value, fraction) pairs + """ + return Cdf(dict(items), label=label) + + +def MakeCdfFromDict(d, label=None): + """Makes a CDF from a dictionary that maps values to frequencies. + + Args: + d: dictionary that maps values to frequencies. + label: string label for the data. + + Returns: + Cdf object + """ + return Cdf(d, label=label) + + +def MakeCdfFromList(seq, label=None): + """Creates a CDF from an unsorted sequence. 
+ + Args: + seq: unsorted sequence of sortable values + label: string label for the cdf + + Returns: + Cdf object + """ + return Cdf(seq, label=label) + + +def MakeCdfFromHist(hist, label=None): + """Makes a CDF from a Hist object. + + Args: + hist: Pmf.Hist object + label: string label for the data. + + Returns: + Cdf object + """ + if label is None: + label = hist.label + + return Cdf(hist, label=label) + + +def MakeCdfFromPmf(pmf, label=None): + """Makes a CDF from a Pmf object. + + Args: + pmf: Pmf.Pmf object + label: string label for the data. + + Returns: + Cdf object + """ + if label is None: + label = pmf.label + + return Cdf(pmf, label=label) + + +class UnimplementedMethodException(Exception): + """Exception if someone calls a method that should be overridden.""" + + +class Suite(Pmf): + """Represents a suite of hypotheses and their probabilities.""" + + def Update(self, data): + """Updates each hypothesis based on the data. + + data: any representation of the data + + returns: the normalizing constant + """ + for hypo in self.Values(): + like = self.Likelihood(data, hypo) + self.Mult(hypo, like) + return self.Normalize() + + def LogUpdate(self, data): + """Updates a suite of hypotheses based on new data. + + Modifies the suite directly; if you want to keep the original, make + a copy. + + Note: unlike Update, LogUpdate does not normalize. + + Args: + data: any representation of the data + """ + for hypo in self.Values(): + like = self.LogLikelihood(data, hypo) + self.Incr(hypo, like) + + def UpdateSet(self, dataset): + """Updates each hypothesis based on the dataset. + + This is more efficient than calling Update repeatedly because + it waits until the end to Normalize. + + Modifies the suite directly; if you want to keep the original, make + a copy. 
+ + dataset: a sequence of data + + returns: the normalizing constant + """ + for data in dataset: + for hypo in self.Values(): + like = self.Likelihood(data, hypo) + self.Mult(hypo, like) + return self.Normalize() + + def LogUpdateSet(self, dataset): + """Updates each hypothesis based on the dataset. + + Modifies the suite directly; if you want to keep the original, make + a copy. + + dataset: a sequence of data + + returns: None + """ + for data in dataset: + self.LogUpdate(data) + + def Likelihood(self, data, hypo): + """Computes the likelihood of the data under the hypothesis. + + hypo: some representation of the hypothesis + data: some representation of the data + """ + raise UnimplementedMethodException() + + def LogLikelihood(self, data, hypo): + """Computes the log likelihood of the data under the hypothesis. + + hypo: some representation of the hypothesis + data: some representation of the data + """ + raise UnimplementedMethodException() + + def Print(self): + """Prints the hypotheses and their probabilities.""" + for hypo, prob in sorted(self.Items()): + print(hypo, prob) + + def MakeOdds(self): + """Transforms from probabilities to odds. + + Values with prob=0 are removed. + """ + for hypo, prob in self.Items(): + if prob: + self.Set(hypo, Odds(prob)) + else: + self.Remove(hypo) + + def MakeProbs(self): + """Transforms from odds to probabilities.""" + for hypo, odds in self.Items(): + self.Set(hypo, Probability(odds)) + + +def MakeSuiteFromList(t, label=None): + """Makes a suite from an unsorted sequence of values. + + Args: + t: sequence of numbers + label: string label for this suite + + Returns: + Suite object + """ + hist = MakeHistFromList(t, label=label) + d = hist.GetDict() + return MakeSuiteFromDict(d) + + +def MakeSuiteFromHist(hist, label=None): + """Makes a normalized suite from a Hist object. 
+
+    Args:
+        hist: Hist object
+        label: string label
+
+    Returns:
+        Suite object
+    """
+    if label is None:
+        label = hist.label
+
+    # make a copy of the dictionary
+    d = dict(hist.GetDict())
+    return MakeSuiteFromDict(d, label)
+
+
+def MakeSuiteFromDict(d, label=None):
+    """Makes a suite from a map from values to probabilities.
+
+    Args:
+        d: dictionary that maps values to probabilities
+        label: string label for this suite
+
+    Returns:
+        Suite object
+    """
+    suite = Suite(label=label)
+    suite.SetDict(d)
+    suite.Normalize()
+    return suite
+
+
+class Pdf(object):
+    """Represents a probability density function (PDF)."""
+
+    def Density(self, x):
+        """Evaluates this Pdf at x.
+
+        Returns: float or NumPy array of probability density
+        """
+        raise UnimplementedMethodException()
+
+    def GetLinspace(self):
+        """Get a linspace for plotting.
+
+        Not all subclasses of Pdf implement this.
+
+        Returns: numpy array
+        """
+        raise UnimplementedMethodException()
+
+    def MakePmf(self, **options):
+        """Makes a discrete version of this Pdf.
+
+        options can include
+        label: string
+        low: low end of range
+        high: high end of range
+        n: number of places to evaluate
+
+        Returns: new Pmf
+        """
+        label = options.pop('label', '')
+        xs, ds = self.Render(**options)
+        return Pmf(dict(zip(xs, ds)), label=label)
+
+    def Render(self, **options):
+        """Generates a sequence of points suitable for plotting.
+
+        If options includes low and high, it must also include n;
+        in that case the density is evaluated at n locations between
+        low and high, including both.
+
+        If options includes xs, the density is evaluated at those locations.
+
+        Otherwise, self.GetLinspace is invoked to provide the locations.
+ + Returns: + tuple of (xs, densities) + """ + low, high = options.pop('low', None), options.pop('high', None) + if low is not None and high is not None: + n = options.pop('n', 101) + xs = np.linspace(low, high, n) + else: + xs = options.pop('xs', None) + if xs is None: + xs = self.GetLinspace() + + ds = self.Density(xs) + return xs, ds + + def Items(self): + """Generates a sequence of (value, probability) pairs. + """ + return zip(*self.Render()) + + +class NormalPdf(Pdf): + """Represents the PDF of a Normal distribution.""" + + def __init__(self, mu=0, sigma=1, label=None): + """Constructs a Normal Pdf with given mu and sigma. + + mu: mean + sigma: standard deviation + label: string + """ + self.mu = mu + self.sigma = sigma + self.label = label if label is not None else '_nolegend_' + + def __str__(self): + return 'NormalPdf(%f, %f)' % (self.mu, self.sigma) + + def GetLinspace(self): + """Get a linspace for plotting. + + Returns: numpy array + """ + low, high = self.mu-3*self.sigma, self.mu+3*self.sigma + return np.linspace(low, high, 101) + + def Density(self, xs): + """Evaluates this Pdf at xs. + + xs: scalar or sequence of floats + + returns: float or NumPy array of probability density + """ + return stats.norm.pdf(xs, self.mu, self.sigma) + + +class ExponentialPdf(Pdf): + """Represents the PDF of an exponential distribution.""" + + def __init__(self, lam=1, label=None): + """Constructs an exponential Pdf with given parameter. + + lam: rate parameter + label: string + """ + self.lam = lam + self.label = label if label is not None else '_nolegend_' + + def __str__(self): + return 'ExponentialPdf(%f)' % (self.lam) + + def GetLinspace(self): + """Get a linspace for plotting. + + Returns: numpy array + """ + low, high = 0, 5.0/self.lam + return np.linspace(low, high, 101) + + def Density(self, xs): + """Evaluates this Pdf at xs. 
+
+        xs: scalar or sequence of floats
+
+        returns: float or NumPy array of probability density
+        """
+        return stats.expon.pdf(xs, scale=1.0/self.lam)
+
+
+class EstimatedPdf(Pdf):
+    """Represents a PDF estimated by KDE."""
+
+    def __init__(self, sample, label=None):
+        """Estimates the density function based on a sample.
+
+        sample: sequence of data
+        label: string
+        """
+        self.label = label if label is not None else '_nolegend_'
+        self.kde = stats.gaussian_kde(sample)
+        low = min(sample)
+        high = max(sample)
+        self.linspace = np.linspace(low, high, 101)
+
+    def __str__(self):
+        return 'EstimatedPdf(label=%s)' % str(self.label)
+
+    def GetLinspace(self):
+        """Get a linspace for plotting.
+
+        Returns: numpy array
+        """
+        return self.linspace
+
+    def Density(self, xs):
+        """Evaluates this Pdf at xs.
+
+        returns: float or NumPy array of probability density
+        """
+        return self.kde.evaluate(xs)
+
+
+def CredibleInterval(pmf, percentage=90):
+    """Computes a credible interval for a given distribution.
+
+    If percentage=90, computes the 90% CI.
+
+    Args:
+        pmf: Pmf object representing a posterior distribution
+        percentage: float between 0 and 100
+
+    Returns:
+        sequence of two floats, low and high
+    """
+    cdf = pmf.MakeCdf()
+    prob = (1 - percentage / 100.0) / 2
+    interval = cdf.Value(prob), cdf.Value(1 - prob)
+    return interval
+
+
+def PmfProbLess(pmf1, pmf2):
+    """Probability that a value from pmf1 is less than a value from pmf2.
+
+    Args:
+        pmf1: Pmf object
+        pmf2: Pmf object
+
+    Returns:
+        float probability
+    """
+    total = 0.0
+    for v1, p1 in pmf1.Items():
+        for v2, p2 in pmf2.Items():
+            if v1 < v2:
+                total += p1 * p2
+    return total
+
+
+def PmfProbGreater(pmf1, pmf2):
+    """Probability that a value from pmf1 is greater than a value from pmf2.
+
+    Args:
+        pmf1: Pmf object
+        pmf2: Pmf object
+
+    Returns:
+        float probability
+    """
+    total = 0.0
+    for v1, p1 in pmf1.Items():
+        for v2, p2 in pmf2.Items():
+            if v1 > v2:
+                total += p1 * p2
+    return total
+
+
+def PmfProbEqual(pmf1, pmf2):
+    """Probability that a value from pmf1 equals a value from pmf2.
+
+    Args:
+        pmf1: Pmf object
+        pmf2: Pmf object
+
+    Returns:
+        float probability
+    """
+    total = 0.0
+    for v1, p1 in pmf1.Items():
+        for v2, p2 in pmf2.Items():
+            if v1 == v2:
+                total += p1 * p2
+    return total
+
+
+def RandomSum(dists):
+    """Chooses a random value from each dist and returns the sum.
+
+    dists: sequence of Pmf or Cdf objects
+
+    returns: numerical sum
+    """
+    total = sum(dist.Random() for dist in dists)
+    return total
+
+
+def SampleSum(dists, n):
+    """Draws a sample of sums from a list of distributions.
+
+    dists: sequence of Pmf or Cdf objects
+    n: sample size
+
+    returns: new Pmf of sums
+    """
+    pmf = Pmf(RandomSum(dists) for i in range(n))
+    return pmf
+
+
+def EvalNormalPdf(x, mu, sigma):
+    """Computes the unnormalized PDF of the normal distribution.
+
+    x: value
+    mu: mean
+    sigma: standard deviation
+
+    returns: float probability density
+    """
+    return stats.norm.pdf(x, mu, sigma)
+
+
+def MakeNormalPmf(mu, sigma, num_sigmas, n=201):
+    """Makes a PMF discrete approx to a Normal distribution.
+
+    mu: float mean
+    sigma: float standard deviation
+    num_sigmas: how many sigmas to extend in each direction
+    n: number of values in the Pmf
+
+    returns: normalized Pmf
+    """
+    pmf = Pmf()
+    low = mu - num_sigmas * sigma
+    high = mu + num_sigmas * sigma
+
+    for x in np.linspace(low, high, n):
+        p = EvalNormalPdf(x, mu, sigma)
+        pmf.Set(x, p)
+    pmf.Normalize()
+    return pmf
+
+
+def EvalBinomialPmf(k, n, p):
+    """Evaluates the binomial PMF.
+
+    Returns the probability of k successes in n trials with probability p.
+    """
+    return stats.binom.pmf(k, n, p)
+
+
+def EvalHypergeomPmf(k, N, K, n):
+    """Evaluates the hypergeometric PMF.
+
+    Returns the probability of k successes in n trials from a population
+    N with K successes in it.
+    """
+    return stats.hypergeom.pmf(k, N, K, n)
+
+
+def EvalPoissonPmf(k, lam):
+    """Computes the Poisson PMF.
+
+    k: number of events
+    lam: parameter lambda in events per unit time
+
+    returns: float probability
+    """
+    # don't use the scipy function (yet). for lam=0 it returns NaN;
+    # should be 0.0
+    # return stats.poisson.pmf(k, lam)
+    return lam ** k * math.exp(-lam) / special.gamma(k+1)
+
+
+def MakePoissonPmf(lam, high, step=1):
+    """Makes a PMF discrete approx to a Poisson distribution.
+
+    lam: parameter lambda in events per unit time
+    high: upper bound of the Pmf
+
+    returns: normalized Pmf
+    """
+    pmf = Pmf()
+    for k in range(0, high + 1, step):
+        p = EvalPoissonPmf(k, lam)
+        pmf.Set(k, p)
+    pmf.Normalize()
+    return pmf
+
+
+def EvalExponentialPdf(x, lam):
+    """Computes the exponential PDF.
+
+    x: value
+    lam: parameter lambda in events per unit time
+
+    returns: float probability density
+    """
+    return lam * math.exp(-lam * x)
+
+
+def EvalExponentialCdf(x, lam):
+    """Evaluates CDF of the exponential distribution with parameter lam."""
+    return 1 - math.exp(-lam * x)
+
+
+def MakeExponentialPmf(lam, high, n=200):
+    """Makes a PMF discrete approx to an exponential distribution.
+
+    lam: parameter lambda in events per unit time
+    high: upper bound
+    n: number of values in the Pmf
+
+    returns: normalized Pmf
+    """
+    pmf = Pmf()
+    for x in np.linspace(0, high, n):
+        p = EvalExponentialPdf(x, lam)
+        pmf.Set(x, p)
+    pmf.Normalize()
+    return pmf
+
+
+def StandardNormalCdf(x):
+    """Evaluates the CDF of the standard Normal distribution.
+
+    See http://en.wikipedia.org/wiki/Normal_distribution
+    #Cumulative_distribution_function
+
+    Args:
+        x: float
+
+    Returns:
+        float
+    """
+    return (math.erf(x / ROOT2) + 1) / 2
+
+
+def EvalNormalCdf(x, mu=0, sigma=1):
+    """Evaluates the CDF of the normal distribution.
+ + Args: + x: float + + mu: mean parameter + + sigma: standard deviation parameter + + Returns: + float + """ + return stats.norm.cdf(x, loc=mu, scale=sigma) + + +def EvalNormalCdfInverse(p, mu=0, sigma=1): + """Evaluates the inverse CDF of the normal distribution. + + See http://en.wikipedia.org/wiki/Normal_distribution#Quantile_function + + Args: + p: float + + mu: mean parameter + + sigma: standard deviation parameter + + Returns: + float + """ + return stats.norm.ppf(p, loc=mu, scale=sigma) + + +def EvalLognormalCdf(x, mu=0, sigma=1): + """Evaluates the CDF of the lognormal distribution. + + x: float or sequence + mu: mean parameter + sigma: standard deviation parameter + + Returns: float or sequence + """ + return stats.lognorm.cdf(x, loc=mu, scale=sigma) + + +def RenderExpoCdf(lam, low, high, n=101): + """Generates sequences of xs and ps for an exponential CDF. + + lam: parameter + low: float + high: float + n: number of points to render + + returns: numpy arrays (xs, ps) + """ + xs = np.linspace(low, high, n) + ps = 1 - np.exp(-lam * xs) + #ps = stats.expon.cdf(xs, scale=1.0/lam) + return xs, ps + + +def RenderNormalCdf(mu, sigma, low, high, n=101): + """Generates sequences of xs and ps for a Normal CDF. + + mu: parameter + sigma: parameter + low: float + high: float + n: number of points to render + + returns: numpy arrays (xs, ps) + """ + xs = np.linspace(low, high, n) + ps = stats.norm.cdf(xs, mu, sigma) + return xs, ps + + +def RenderParetoCdf(xmin, alpha, low, high, n=50): + """Generates sequences of xs and ps for a Pareto CDF. + + xmin: parameter + alpha: parameter + low: float + high: float + n: number of points to render + + returns: numpy arrays (xs, ps) + """ + if low < xmin: + low = xmin + xs = np.linspace(low, high, n) + ps = 1 - (xs / xmin) ** -alpha + #ps = stats.pareto.cdf(xs, scale=xmin, b=alpha) + return xs, ps + + +class Beta(object): + """Represents a Beta distribution. 
+ + See http://en.wikipedia.org/wiki/Beta_distribution + """ + def __init__(self, alpha=1, beta=1, label=None): + """Initializes a Beta distribution.""" + self.alpha = alpha + self.beta = beta + self.label = label if label is not None else '_nolegend_' + + def Update(self, data): + """Updates a Beta distribution. + + data: pair of int (heads, tails) + """ + heads, tails = data + self.alpha += heads + self.beta += tails + + def Mean(self): + """Computes the mean of this distribution.""" + return self.alpha / (self.alpha + self.beta) + + def Random(self): + """Generates a random variate from this distribution.""" + return random.betavariate(self.alpha, self.beta) + + def Sample(self, n): + """Generates a random sample from this distribution. + + n: int sample size + """ + size = n, + return np.random.beta(self.alpha, self.beta, size) + + def EvalPdf(self, x): + """Evaluates the PDF at x.""" + return x ** (self.alpha - 1) * (1 - x) ** (self.beta - 1) + + def MakePmf(self, steps=101, label=None): + """Returns a Pmf of this distribution. + + Note: Normally, we just evaluate the PDF at a sequence + of points and treat the probability density as a probability + mass. + + But if alpha or beta is less than one, we have to be + more careful because the PDF goes to infinity at x=0 + and x=1. In that case we evaluate the CDF and compute + differences. + """ + if self.alpha < 1 or self.beta < 1: + cdf = self.MakeCdf() + pmf = cdf.MakePmf() + return pmf + + xs = [i / (steps - 1.0) for i in range(steps)] + probs = [self.EvalPdf(x) for x in xs] + pmf = Pmf(dict(zip(xs, probs)), label=label) + return pmf + + def MakeCdf(self, steps=101): + """Returns the CDF of this distribution.""" + xs = [i / (steps - 1.0) for i in range(steps)] + ps = [special.betainc(self.alpha, self.beta, x) for x in xs] + cdf = Cdf(xs, ps) + return cdf + + +class Dirichlet(object): + """Represents a Dirichlet distribution. 
+ + See http://en.wikipedia.org/wiki/Dirichlet_distribution + """ + + def __init__(self, n, conc=1, label=None): + """Initializes a Dirichlet distribution. + + n: number of dimensions + conc: concentration parameter (smaller yields more concentration) + label: string label + """ + if n < 2: + raise ValueError('A Dirichlet distribution with ' + 'n<2 makes no sense') + + self.n = n + self.params = np.ones(n, dtype=np.float) * conc + self.label = label if label is not None else '_nolegend_' + + def Update(self, data): + """Updates a Dirichlet distribution. + + data: sequence of observations, in order corresponding to params + """ + m = len(data) + self.params[:m] += data + + def Random(self): + """Generates a random variate from this distribution. + + Returns: normalized vector of fractions + """ + p = np.random.gamma(self.params) + return p / p.sum() + + def Likelihood(self, data): + """Computes the likelihood of the data. + + Selects a random vector of probabilities from this distribution. + + Returns: float probability + """ + m = len(data) + if self.n < m: + return 0 + + x = data + p = self.Random() + q = p[:m] ** x + return q.prod() + + def LogLikelihood(self, data): + """Computes the log likelihood of the data. + + Selects a random vector of probabilities from this distribution. + + Returns: float log probability + """ + m = len(data) + if self.n < m: + return float('-inf') + + x = self.Random() + y = np.log(x[:m]) * data + return y.sum() + + def MarginalBeta(self, i): + """Computes the marginal distribution of the ith element. + + See http://en.wikipedia.org/wiki/Dirichlet_distribution + #Marginal_distributions + + i: int + + Returns: Beta object + """ + alpha0 = self.params.sum() + alpha = self.params[i] + return Beta(alpha, alpha0 - alpha) + + def PredictivePmf(self, xs, label=None): + """Makes a predictive distribution. 
+ + xs: values to go into the Pmf + + Returns: Pmf that maps from x to the mean prevalence of x + """ + alpha0 = self.params.sum() + ps = self.params / alpha0 + return Pmf(zip(xs, ps), label=label) + + +def BinomialCoef(n, k): + """Compute the binomial coefficient "n choose k". + + n: number of trials + k: number of successes + + Returns: float + """ + return scipy.misc.comb(n, k) + + +def LogBinomialCoef(n, k): + """Computes the log of the binomial coefficient. + + http://math.stackexchange.com/questions/64716/ + approximating-the-logarithm-of-the-binomial-coefficient + + n: number of trials + k: number of successes + + Returns: float + """ + return n * math.log(n) - k * math.log(k) - (n - k) * math.log(n - k) + + +def NormalProbability(ys, jitter=0.0): + """Generates data for a normal probability plot. + + ys: sequence of values + jitter: float magnitude of jitter added to the ys + + returns: numpy arrays xs, ys + """ + n = len(ys) + xs = np.random.normal(0, 1, n) + xs.sort() + + if jitter: + ys = Jitter(ys, jitter) + else: + ys = np.array(ys) + ys.sort() + + return xs, ys + + +def Jitter(values, jitter=0.5): + """Jitters the values by adding a uniform variate in (-jitter, jitter). + + values: sequence + jitter: scalar magnitude of jitter + + returns: new numpy array + """ + n = len(values) + return np.random.uniform(-jitter, +jitter, n) + values + + +def NormalProbabilityPlot(sample, fit_color='0.8', **options): + """Makes a normal probability plot with a fitted line. + + sample: sequence of numbers + fit_color: color string for the fitted line + options: passed along to Plot + """ + xs, ys = NormalProbability(sample) + mean, var = MeanVar(sample) + std = math.sqrt(var) + + fit = FitLine(xs, mean, std) + thinkplot.Plot(*fit, color=fit_color, label='model') + + xs, ys = NormalProbability(sample) + thinkplot.Plot(xs, ys, **options) + + +def Mean(xs): + """Computes mean. 
+ + xs: sequence of values + + returns: float mean + """ + return np.mean(xs) + + +def Var(xs, mu=None, ddof=0): + """Computes variance. + + xs: sequence of values + mu: option known mean + ddof: delta degrees of freedom + + returns: float + """ + xs = np.asarray(xs) + + if mu is None: + mu = xs.mean() + + ds = xs - mu + return np.dot(ds, ds) / (len(xs) - ddof) + + +def Std(xs, mu=None, ddof=0): + """Computes standard deviation. + + xs: sequence of values + mu: option known mean + ddof: delta degrees of freedom + + returns: float + """ + var = Var(xs, mu, ddof) + return math.sqrt(var) + + +def MeanVar(xs, ddof=0): + """Computes mean and variance. + + Based on http://stackoverflow.com/questions/19391149/ + numpy-mean-and-variance-from-single-function + + xs: sequence of values + ddof: delta degrees of freedom + + returns: pair of float, mean and var + """ + xs = np.asarray(xs) + mean = xs.mean() + s2 = Var(xs, mean, ddof) + return mean, s2 + + +def Trim(t, p=0.01): + """Trims the largest and smallest elements of t. + + Args: + t: sequence of numbers + p: fraction of values to trim off each end + + Returns: + sequence of values + """ + n = int(p * len(t)) + t = sorted(t)[n:-n] + return t + + +def TrimmedMean(t, p=0.01): + """Computes the trimmed mean of a sequence of numbers. + + Args: + t: sequence of numbers + p: fraction of values to trim off each end + + Returns: + float + """ + t = Trim(t, p) + return Mean(t) + + +def TrimmedMeanVar(t, p=0.01): + """Computes the trimmed mean and variance of a sequence of numbers. + + Side effect: sorts the list. + + Args: + t: sequence of numbers + p: fraction of values to trim off each end + + Returns: + float + """ + t = Trim(t, p) + mu, var = MeanVar(t) + return mu, var + + +def CohenEffectSize(group1, group2): + """Compute Cohen's d. 
+ + group1: Series or NumPy array + group2: Series or NumPy array + + returns: float + """ + diff = group1.mean() - group2.mean() + + n1, n2 = len(group1), len(group2) + var1 = group1.var() + var2 = group2.var() + + pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2) + d = diff / math.sqrt(pooled_var) + return d + + +def Cov(xs, ys, meanx=None, meany=None): + """Computes Cov(X, Y). + + Args: + xs: sequence of values + ys: sequence of values + meanx: optional float mean of xs + meany: optional float mean of ys + + Returns: + Cov(X, Y) + """ + xs = np.asarray(xs) + ys = np.asarray(ys) + + if meanx is None: + meanx = np.mean(xs) + if meany is None: + meany = np.mean(ys) + + cov = np.dot(xs-meanx, ys-meany) / len(xs) + return cov + + +def Corr(xs, ys): + """Computes Corr(X, Y). + + Args: + xs: sequence of values + ys: sequence of values + + Returns: + Corr(X, Y) + """ + xs = np.asarray(xs) + ys = np.asarray(ys) + + meanx, varx = MeanVar(xs) + meany, vary = MeanVar(ys) + + corr = Cov(xs, ys, meanx, meany) / math.sqrt(varx * vary) + + return corr + + +def SerialCorr(series, lag=1): + """Computes the serial correlation of a series. + + series: Series + lag: integer number of intervals to shift + + returns: float correlation + """ + xs = series[lag:] + ys = series.shift(lag)[lag:] + corr = Corr(xs, ys) + return corr + + +def SpearmanCorr(xs, ys): + """Computes Spearman's rank correlation. + + Args: + xs: sequence of values + ys: sequence of values + + Returns: + float Spearman's correlation + """ + xranks = pandas.Series(xs).rank() + yranks = pandas.Series(ys).rank() + return Corr(xranks, yranks) + + +def MapToRanks(t): + """Returns a list of ranks corresponding to the elements in t. 
+ + Args: + t: sequence of numbers + + Returns: + list of integer ranks, starting at 1 + """ + # pair up each value with its index + pairs = enumerate(t) + + # sort by value + sorted_pairs = sorted(pairs, key=itemgetter(1)) + + # pair up each pair with its rank + ranked = enumerate(sorted_pairs) + + # sort by index + resorted = sorted(ranked, key=lambda trip: trip[1][0]) + + # extract the ranks + ranks = [trip[0]+1 for trip in resorted] + return ranks + + +def LeastSquares(xs, ys): + """Computes a linear least squares fit for ys as a function of xs. + + Args: + xs: sequence of values + ys: sequence of values + + Returns: + tuple of (intercept, slope) + """ + meanx, varx = MeanVar(xs) + meany = Mean(ys) + + slope = Cov(xs, ys, meanx, meany) / varx + inter = meany - slope * meanx + + return inter, slope + + +def FitLine(xs, inter, slope): + """Fits a line to the given data. + + xs: sequence of x + + returns: tuple of numpy arrays (sorted xs, fit ys) + """ + fit_xs = np.sort(xs) + fit_ys = inter + slope * fit_xs + return fit_xs, fit_ys + + +def Residuals(xs, ys, inter, slope): + """Computes residuals for a linear fit with parameters inter and slope. + + Args: + xs: independent variable + ys: dependent variable + inter: float intercept + slope: float slope + + Returns: + list of residuals + """ + xs = np.asarray(xs) + ys = np.asarray(ys) + res = ys - (inter + slope * xs) + return res + + +def CoefDetermination(ys, res): + """Computes the coefficient of determination (R^2) for given residuals. + + Args: + ys: dependent variable + res: residuals + + Returns: + float coefficient of determination + """ + return 1 - Var(res) / Var(ys) + + +def CorrelatedGenerator(rho): + """Generates standard normal variates with serial correlation. 
+ + rho: target coefficient of correlation + + Returns: iterable + """ + x = random.gauss(0, 1) + yield x + + sigma = math.sqrt(1 - rho**2) + while True: + x = random.gauss(x * rho, sigma) + yield x + + +def CorrelatedNormalGenerator(mu, sigma, rho): + """Generates normal variates with serial correlation. + + mu: mean of variate + sigma: standard deviation of variate + rho: target coefficient of correlation + + Returns: iterable + """ + for x in CorrelatedGenerator(rho): + yield x * sigma + mu + + +def RawMoment(xs, k): + """Computes the kth raw moment of xs. + """ + return sum(x**k for x in xs) / len(xs) + + +def CentralMoment(xs, k): + """Computes the kth central moment of xs. + """ + mean = RawMoment(xs, 1) + return sum((x - mean)**k for x in xs) / len(xs) + + +def StandardizedMoment(xs, k): + """Computes the kth standardized moment of xs. + """ + var = CentralMoment(xs, 2) + std = math.sqrt(var) + return CentralMoment(xs, k) / std**k + + +def Skewness(xs): + """Computes skewness. + """ + return StandardizedMoment(xs, 3) + + +def Median(xs): + """Computes the median (50th percentile) of a sequence. + + xs: sequence or anything else that can initialize a Cdf + + returns: float + """ + cdf = Cdf(xs) + return cdf.Value(0.5) + + +def IQR(xs): + """Computes the interquartile of a sequence. + + xs: sequence or anything else that can initialize a Cdf + + returns: pair of floats + """ + cdf = Cdf(xs) + return cdf.Value(0.25), cdf.Value(0.75) + + +def PearsonMedianSkewness(xs): + """Computes the Pearson median skewness. + """ + median = Median(xs) + mean = RawMoment(xs, 1) + var = CentralMoment(xs, 2) + std = math.sqrt(var) + gp = 3 * (mean - median) / std + return gp + + +class FixedWidthVariables(object): + """Represents a set of variables in a fixed width file.""" + + def __init__(self, variables, index_base=0): + """Initializes. + + variables: DataFrame + index_base: are the indices 0 or 1 based? 
+ + Attributes: + colspecs: list of (start, end) index tuples + names: list of string variable names + """ + self.variables = variables + + # note: by default, subtract 1 from colspecs + self.colspecs = variables[['start', 'end']] - index_base + + # convert colspecs to a list of pair of int + self.colspecs = self.colspecs.astype(np.int).values.tolist() + self.names = variables['name'] + + def ReadFixedWidth(self, filename, **options): + """Reads a fixed width ASCII file. + + filename: string filename + + returns: DataFrame + """ + df = pandas.read_fwf(filename, + colspecs=self.colspecs, + names=self.names, + **options) + return df + + +def ReadStataDct(dct_file, **options): + """Reads a Stata dictionary file. + + dct_file: string filename + options: dict of options passed to open() + + returns: FixedWidthVariables object + """ + type_map = dict(byte=int, int=int, long=int, float=float, double=float) + + var_info = [] + for line in open(dct_file, **options): + match = re.search( r'_column\(([^)]*)\)', line) + if match: + start = int(match.group(1)) + t = line.split() + vtype, name, fstring = t[1:4] + name = name.lower() + if vtype.startswith('str'): + vtype = str + else: + vtype = type_map[vtype] + long_desc = ' '.join(t[4:]).strip('"') + var_info.append((start, vtype, name, fstring, long_desc)) + + columns = ['start', 'type', 'name', 'fstring', 'desc'] + variables = pandas.DataFrame(var_info, columns=columns) + + # fill in the end column by shifting the start column + variables['end'] = variables.start.shift(-1) + variables.loc[len(variables)-1, 'end'] = 0 + + dct = FixedWidthVariables(variables, index_base=1) + return dct + + +def Resample(xs, n=None): + """Draw a sample from xs with the same length as xs. + + xs: sequence + n: sample size (default: len(xs)) + + returns: NumPy array + """ + if n is None: + n = len(xs) + return np.random.choice(xs, n, replace=True) + + +def SampleRows(df, nrows, replace=False): + """Choose a sample of rows from a DataFrame. 
+ + df: DataFrame + nrows: number of rows + replace: whether to sample with replacement + + returns: DataDf + """ + indices = np.random.choice(df.index, nrows, replace=replace) + sample = df.loc[indices] + return sample + + +def ResampleRows(df): + """Resamples rows from a DataFrame. + + df: DataFrame + + returns: DataFrame + """ + return SampleRows(df, len(df), replace=True) + + +def ResampleRowsWeighted(df, column='finalwgt'): + """Resamples a DataFrame using probabilities proportional to given column. + + df: DataFrame + column: string column name to use as weights + + returns: DataFrame + """ + weights = df[column] + cdf = Cdf(dict(weights)) + indices = cdf.Sample(len(weights)) + sample = df.loc[indices] + return sample + + +def PercentileRow(array, p): + """Selects the row from a sorted array that maps to percentile p. + + p: float 0--100 + + returns: NumPy array (one row) + """ + rows, cols = array.shape + index = int(rows * p / 100) + return array[index,] + + +def PercentileRows(ys_seq, percents): + """Given a collection of lines, selects percentiles along vertical axis. + + For example, if ys_seq contains simulation results like ys as a + function of time, and percents contains (5, 95), the result would + be a 90% CI for each vertical slice of the simulation results. + + ys_seq: sequence of lines (y values) + percents: list of percentiles (0-100) to select + + returns: list of NumPy arrays, one for each percentile + """ + nrows = len(ys_seq) + ncols = len(ys_seq[0]) + array = np.zeros((nrows, ncols)) + + for i, ys in enumerate(ys_seq): + array[i,] = ys + + array = np.sort(array, axis=0) + + rows = [PercentileRow(array, p) for p in percents] + return rows + + +def Smooth(xs, sigma=2, **options): + """Smooths a NumPy array with a Gaussian filter. 
+ + xs: sequence + sigma: standard deviation of the filter + """ + return ndimage.filters.gaussian_filter1d(xs, sigma, **options) + + +class HypothesisTest(object): + """Represents a hypothesis test.""" + + def __init__(self, data): + """Initializes. + + data: data in whatever form is relevant + """ + self.data = data + self.MakeModel() + self.actual = self.TestStatistic(data) + self.test_stats = None + self.test_cdf = None + + def PValue(self, iters=1000): + """Computes the distribution of the test statistic and p-value. + + iters: number of iterations + + returns: float p-value + """ + self.test_stats = [self.TestStatistic(self.RunModel()) + for _ in range(iters)] + self.test_cdf = Cdf(self.test_stats) + + count = sum(1 for x in self.test_stats if x >= self.actual) + return count / iters + + def MaxTestStat(self): + """Returns the largest test statistic seen during simulations. + """ + return max(self.test_stats) + + def PlotCdf(self, label=None): + """Draws a Cdf with vertical lines at the observed test stat. + """ + def VertLine(x): + """Draws a vertical line at x.""" + thinkplot.Plot([x, x], [0, 1], color='0.8') + + VertLine(self.actual) + thinkplot.Cdf(self.test_cdf, label=label) + + def TestStatistic(self, data): + """Computes the test statistic. + + data: data in whatever form is relevant + """ + raise UnimplementedMethodException() + + def MakeModel(self): + """Build a model of the null hypothesis. + """ + pass + + def RunModel(self): + """Run the model of the null hypothesis. + + returns: simulated data + """ + raise UnimplementedMethodException() + + +def main(): + pass + + +if __name__ == '__main__': + main()