* This do-file takes the raw data files from IPUMS 100% sample of the 1880 federal census and 
* constructs the city-level segregation measures.

* Start with no dataset in memory.
clear

* NOTE(review): this looks like legacy syntax for switching off the -more- pause in output; confirm on the Stata version in use.
set more 1


* The segdir global macro is not defined in this file; it has to be set before this do-file is run.
cd "$segdir"

* Load the extract from the working directory selected above.
use "IPUMS_1880_Extract_JP0086.dta" //1880 100% sample from IPUMS -- keeping data for household heads only	

	drop if city==0 & incorp==0 // Household heads with neither a city code nor an incorporated-place code (about 34% of HH) are discarded.

	* A few cities have unincorporated portions; assign them the incorporated-place code.
	* Each quoted pair below holds the city code followed by the incorp code to assign.
	foreach fix in ///
		"830 80060"  /// Bridgeport, CT
		"4030 80200" /// Meriden, CT
		"4122 80210" /// Middletown, CT
		"4470 80230" /// New Britain, CT
		"4870 80280" /// Norwalk, CT
		"4890 80290" /// Norwich, CT
		"6730 80370" /// Stamford, CT
		"7250 80420" /// Waterbury, CT
		"6014 80600" /// Rutland, VT
		{
		* First token is the city code, the remainder is the incorp code.
		gettoken citycode incorpcode : fix
		replace incorp = `incorpcode' if city == `citycode'
	}

	* Youngstown: about 50 residents lie outside the city boundary but share the broader incorporated code, so give them the city code.
	replace city = 7630 if statefip==39 & incorp==89430

	* Georgetown is folded into DC because that is how it is reported in Baker.
	replace city   = 7230  if city==7231
	replace incorp = 80020 if city==7230
	
	*Now places are uniquely identified by statefip city and incorp. Some incorporated places don't have city codes (and the other way too).
	* place_id is one integer code per distinct (statefip, city, incorp) combination.
	egen place_id=group(statefip city incorp)
	
	*Because we want to treat annexed cities as one, generate master sup and ed variables
	* Both identifiers are numbered within place_id, so the same supdist or enumdist value in two different places gets two different codes.
	* NOTE(review): master_ed is grouped on place_id and enumdist only, not on supdist; confirm that enumdist does not repeat across supervisor districts inside one place.
	egen master_sup=group(place_id supdist)
	egen master_ed=group(place_id enumdist)
	
	* Annexed cities are treated as one place, so place-level controls are built from the recoded identifiers.
	* The household-head data are set aside with preserve and brought back by restore once the counts are on disk.
	preserve
		* One observation per household head, so summing a constant 1 counts households.
		generate byte tot_hh = 1

		collapse (sum) tot_hh, by(region statefip city incorp)

		* Temporary file; it is merged back in and erased at the end of this do-file.
		save "_tempcontrols.dta", replace
	restore
	
	
	* First order the household heads by the order in which they appear in the census manuscript.
	* This is done by sorting every household head by his line number (line) on his census
	* manuscript page (pageno). Page numbers are not unique, they are repeated across census reels
	* and occasionally within reels. For this reason it is necessary to first sort by the reel
	* number and then by the microfilm sequence number (microseq).
	 
	* Note that page numbers are reported on certain microfilm reels

	* The neighbor variables built below read the previous and next observation, so the row order
	* must be reproducible. Without the stable option Stata orders tied observations randomly, which
	* would change the neighbor assignments from run to run whenever two records share the same
	* reel, microseq, pageno and line. With stable, tied records keep their order from the input file.
	sort reel microseq pageno line, stable

	* Any two individuals on the same census manuscript page should have the same values for reel, microseq and pageno.
	* The sample contains only household heads so the line numbers will typically not be consecutive integers
	* after the above sorting command (each member of the household is on a separate line so there will typically
	* be gaps in line number after the sorting).
	* However, line number remains useful in that sorting by line number within a census manuscript page should sort
	* the household heads as the enumerator would walk down the street.  So the previous and next observations
	* should be the adjacent neighbors.

	* Two different definitions of neighbors will be used.  The first identifies next door neighbors as those household heads
	* appearing directly before and directly after an individual's household on the census manuscript page.

	* The second definition adds an additional restriction that the street name must be given and match for two individuals to
	* be considered neighbors and relaxes the restriction that the neighbors must appear on the same manuscript page, allowing
	* for the neighbors to be the last household on one page and the first household on the next page.
	* All variables based on this second definition will contain 'alt' in the variable name.
	* Note that there are many observations missing street name.

		
		* The sorter and criteria global macros are not defined in this file; they must be set before it runs.
		* sorting_trait starts at 0 wherever the sorter variable is non-missing and is set to 1 where the criteria expression holds.
		* NOTE(review): the replace is not restricted to non-missing values of the sorter variable. If the criteria
		* expression evaluates to true on missing values (Stata ranks missing above every number), those
		* observations receive trait 1; confirm against the definition of the criteria global.
		gen sorting_trait = 0 if $sorter~=.
		replace sorting_trait = 1 if $criteria

		* This variable should be binary, splitting the population into two groups.
		* Here nativity is used: the groups are native born and foreign born.
		* To keep the segregation measure comparable across sorting traits, code the
		* majority group as 0 and the minority group as 1.


	* Neighbor 1 is the previous observation (_n-1) and neighbor 2 the next one (_n+1) in the sorted data.
	* Page definition: same reel, microseq, pageno and place.
	* Street definition (alt): same reel, microseq, place and a matching, non-empty street name.
	* A neighbor trait is left missing when the adjacent observation fails the definition.
	local k = 0
	foreach adj in _n-1 _n+1 {
		local ++k
		local same_film "reel==reel[`adj'] & microseq==microseq[`adj'] & place_id==place_id[`adj']"
		gen neighbor`k'trait    = sorting_trait[`adj'] if `same_film' & pageno==pageno[`adj']
		gen neighbor`k'traitalt = sorting_trait[`adj'] if `same_film' & street==street[`adj'] & street~=""
	}

	* 0/1 indicators for whether each adjacent neighbor was observed (its trait is non-missing),
	* under the page definition and under the street (alt) definition.
	gen neighbor1present    = (neighbor1trait~=.)
	gen neighbor2present    = (neighbor2trait~=.)
	gen neighbor1presentalt = (neighbor1traitalt~=.)
	gen neighbor2presentalt = (neighbor2traitalt~=.)

	* Number of observed neighbors (0, 1 or 2) under each definition.
	gen neighborstotal    = neighbor1present + neighbor2present
	gen neighborstotalalt = neighbor1presentalt + neighbor2presentalt

	* Per-household 0/1 counters, built for the majority group (sorting_trait 0) and then the minority group (sorting_trait 1):
	*   counter1 / counter1alt - at least one neighbor observed
	*   counter2 / counter2alt - both neighbors observed
	*   counterall             - membership in the group, regardless of neighbors
	* Households with a missing sorting_trait get 0 on every counter.
	local traitvalue = 0
	foreach grp in majority minority {
		gen `grp'counter1    = (sorting_trait==`traitvalue' & (neighbor1present==1 | neighbor2present==1))
		gen `grp'counter2    = (sorting_trait==`traitvalue' & neighbor1present==1 & neighbor2present==1)
		gen `grp'counter1alt = (sorting_trait==`traitvalue' & (neighbor1presentalt==1 | neighbor2presentalt==1))
		gen `grp'counter2alt = (sorting_trait==`traitvalue' & neighbor1presentalt==1 & neighbor2presentalt==1)
		gen `grp'counterall  = (sorting_trait==`traitvalue')
		local ++traitvalue
	}
	
	
	* Sum the household-level counters within each enumeration district (master_ed),
	* keeping the place and supervisor-district identifiers on every row.
	collapse (sum) ///
		min_all=minoritycounterall maj_all=majoritycounterall ///
		n_min_po=minoritycounter1 n_min_pb=minoritycounter2 ///
		n_maj_po=majoritycounter1 n_maj_pb=majoritycounter2 ///
		n_min_so=minoritycounter1alt n_min_sb=minoritycounter2alt ///
		n_maj_so=majoritycounter1alt n_maj_sb=majoritycounter2alt ///
		, by(statefip city incorp master_sup master_ed)


* Keep a copy of the ED-level counts on disk; it is merged back after the city totals are computed.
save "_temp_ED_counts.dta", replace

* The following variables are in the enumeration district-level data file at this point:

* min_all - total number of minority household heads in the enumeration district
* maj_all - total number of majority household heads in the enumeration district
* n_min_po - total number of minority household heads with at least one neighbor observed, neighbors defined by manuscript page
* n_min_pb - total number of minority household heads with both neighbors observed, neighbors defined by manuscript page
* n_maj_po - total number of majority household heads with at least one neighbor observed, neighbors defined by manuscript page
* n_maj_pb - total number of majority household heads with both neighbors observed, neighbors defined by manuscript page
* n_min_so - total number of minority household heads with at least one neighbor observed, neighbors defined by street
* n_min_sb - total number of minority household heads with both neighbors observed, neighbors defined by street 
* n_maj_so - total number of majority household heads with at least one neighbor observed, neighbors defined by street
* n_maj_sb - total number of majority household heads with both neighbors observed, neighbors defined by street
* statefip - state FIPS code
* city -  IPUMS city code
* incorp - IPUMS incorporated place code
* master_sup - supervisor district identifier, built from place_id and supdist (supervisor district for census enumeration)
* master_ed - enumeration district identifier, built from place_id and enumdist (enumerator district for census enumeration, unique within supervisor district)

* Constructing the segregation measures:

sort statefip city incorp master_sup master_ed

* Combined (minority + majority) household-head counts per enumeration district, one per measure:
*   all - every household head; po / pb - page-based neighbors, at least one / both observed;
*   so / sb - street-based neighbors, at least one / both observed.
* When one of the two group counts is missing, the combined count falls back to the other group's count.
* NOTE(review): the counts come from a collapse (sum), so the missing-value fallbacks are presumably defensive.
gen min_maj_all=min_all+maj_all
replace min_maj_all=min_all if maj_all==. & min_all~=.
replace min_maj_all=maj_all if maj_all~=. & min_all==.
gen n_min_maj_po=n_min_po+n_maj_po
replace n_min_maj_po=n_min_po if n_maj_po==. & n_min_po~=.
replace n_min_maj_po=n_maj_po if n_maj_po~=. & n_min_po==.
gen n_min_maj_pb=n_min_pb+n_maj_pb
replace n_min_maj_pb=n_min_pb if n_maj_pb==. & n_min_pb~=.
replace n_min_maj_pb=n_maj_pb if n_maj_pb~=. & n_min_pb==.
gen n_min_maj_so=n_min_so+n_maj_so
replace n_min_maj_so=n_min_so if n_maj_so==. & n_min_so~=.
* When the minority count is missing, fall back to the majority count (n_maj_so), as for every other measure.
replace n_min_maj_so=n_maj_so if n_maj_so~=. & n_min_so==.
gen n_min_maj_sb=n_min_sb+n_maj_sb
replace n_min_maj_sb=n_min_sb if n_maj_sb==. & n_min_sb~=.
replace n_min_maj_sb=n_maj_sb if n_maj_sb~=. & n_min_sb==.

* Flag enumeration districts that contain at least one household head under each measure.
* The flags are summed at the next step to count the non-empty districts per place.
gen all_flag=0
replace all_flag=1 if min_maj_all>0 & min_maj_all~=.
gen pb_flag=0
replace pb_flag=1 if n_min_maj_pb>0 & n_min_maj_pb~=.
gen po_flag=0
replace po_flag=1 if n_min_maj_po>0 & n_min_maj_po~=.
gen sb_flag=0
replace sb_flag=1 if n_min_maj_sb>0 & n_min_maj_sb~=.
gen so_flag=0
replace so_flag=1 if n_min_maj_so>0 & n_min_maj_so~=.

* Aggregate the enumeration-district rows to one row per place (statefip, city, incorp):
*   *_citytotal  - place-wide sums of the minority and majority counts
*   *_citymin    - smallest combined (minority + majority) district count in the place
*   districts_*  - number of districts flagged as non-empty under each measure
* NOTE(review): the minimum is taken over every district row, including rows whose combined count is 0; confirm this is intended.
collapse ///
	(sum) min_citytotal=min_all maj_citytotal=maj_all ///
	      n_min_po_citytotal=n_min_po n_min_pb_citytotal=n_min_pb ///
	      n_min_so_citytotal=n_min_so n_min_sb_citytotal=n_min_sb ///
	      n_maj_po_citytotal=n_maj_po n_maj_pb_citytotal=n_maj_pb ///
	      n_maj_so_citytotal=n_maj_so n_maj_sb_citytotal=n_maj_sb ///
	(min) min_maj_citymin=min_maj_all ///
	      n_min_maj_po_citymin=n_min_maj_po n_min_maj_pb_citymin=n_min_maj_pb ///
	      n_min_maj_so_citymin=n_min_maj_so n_min_maj_sb_citymin=n_min_maj_sb ///
	(sum) districts_all=all_flag districts_po=po_flag districts_pb=pb_flag ///
	      districts_so=so_flag districts_sb=sb_flag ///
	, by(statefip city incorp)


* Attach the place-level totals to every enumeration-district row saved earlier,
* then delete the temporary ED file.

merge 1:m statefip city incorp using "_temp_ED_counts.dta"
	rm "_temp_ED_counts.dta"

* The combined counts and non-empty flags were dropped by the collapse, so rebuild them on the merged ED rows.

* All household heads.
gen min_maj_all = min_all + maj_all
replace min_maj_all = min_all if maj_all==. & min_all~=.
replace min_maj_all = maj_all if maj_all~=. & min_all==.
gen all_flag = (min_maj_all>0 & min_maj_all~=.)

* Neighbor-based subsets: po / pb use the page definition (at least one / both neighbors observed),
* so / sb use the street definition.
foreach nb in po pb so sb {
	* Combined count, falling back to whichever group count is present when the other is missing.
	gen n_min_maj_`nb' = n_min_`nb' + n_maj_`nb'
	replace n_min_maj_`nb' = n_min_`nb' if n_maj_`nb'==. & n_min_`nb'~=.
	replace n_min_maj_`nb' = n_maj_`nb' if n_maj_`nb'~=. & n_min_`nb'==.

	* 1 when the district has at least one household head under this measure, else 0.
	gen `nb'_flag = (n_min_maj_`nb'>0 & n_min_maj_`nb'~=.)
}

* District-level terms of the dissimilarity index: half the absolute gap between the district's
* share of the place's minority household heads and its share of the majority household heads.
* The terms are summed within place further below.
gen dissim_all = 0.5*abs(min_all/min_citytotal - maj_all/maj_citytotal)
foreach nb in po pb so sb {
	gen dissim_`nb' = 0.5*abs(n_min_`nb'/n_min_`nb'_citytotal - n_maj_`nb'/n_maj_`nb'_citytotal)
}

* Denominator of the isolation index for each measure:
*   (minority total / smallest combined district count) - minority share of the place,
* with the first term capped at 1 whenever that ratio exceeds 1.
* NOTE(review): a missing ratio (for example from a smallest count of 0) also satisfies the
* "greater than 1" test in Stata, so those rows take the capped form as well.
gen iso_denominator_all = min_citytotal/min_maj_citymin - min_citytotal/(min_citytotal+maj_citytotal)
replace iso_denominator_all = 1 - min_citytotal/(min_citytotal+maj_citytotal) if 1<(min_citytotal/min_maj_citymin)

foreach nb in po pb so sb {
	local mintot   n_min_`nb'_citytotal
	local majtot   n_maj_`nb'_citytotal
	local smallest n_min_maj_`nb'_citymin

	gen iso_denominator_`nb' = `mintot'/`smallest' - `mintot'/(`mintot'+`majtot')
	replace iso_denominator_`nb' = 1 - `mintot'/(`mintot'+`majtot') if 1<(`mintot'/`smallest')
}

* District-level terms of the isolation index, computed only for districts flagged as non-empty.
* Each term is the district's minority-exposure contribution minus an equal share
* (1 / number of non-empty districts) of the place-wide minority share, divided by the denominator above.
gen iso_all = (min_all^2/(min_citytotal*(min_all+maj_all)) - (1/districts_all)*min_citytotal/(min_citytotal+maj_citytotal))/iso_denominator_all if all_flag==1

foreach nb in po pb so sb {
	local mintot n_min_`nb'_citytotal
	local majtot n_maj_`nb'_citytotal

	gen iso_`nb' = (n_min_`nb'^2/(`mintot'*(n_min_`nb'+n_maj_`nb')) - (1/districts_`nb')*`mintot'/(`mintot'+`majtot'))/iso_denominator_`nb' if `nb'_flag==1
}

* Sum the district-level terms within each place to obtain the dissimilarity and isolation indices.
* NOTE(review): collapse (sum) treats missing terms as zero, so a place whose terms are all missing ends up with 0 rather than missing; confirm this is acceptable.
collapse (sum) ///
	dissimilarity_all=dissim_all dissimilarity_po=dissim_po dissimilarity_pb=dissim_pb ///
	dissimilarity_so=dissim_so dissimilarity_sb=dissim_sb ///
	isolation_all=iso_all isolation_po=iso_po isolation_pb=iso_pb ///
	isolation_so=iso_so isolation_sb=iso_sb ///
	, by(statefip city incorp)

* Add the place-level household counts saved near the top of this do-file, then delete that temporary file.

merge 1:1 statefip city incorp using "_tempcontrols.dta"
	rm "_tempcontrols.dta"
	drop _merge

* Add the neighbor-based measures from the output directory (the outdir global is not defined in this file).
* The neighbor file is deleted once it has been merged in.
cd "$outdir"
merge 1:1 statefip city incorp using "1880_city_neighbor_$sorter.dta"
rm "1880_city_neighbor_$sorter.dta"
drop _merge

* Final place-level segregation file, named after the sorting variable.
save "1880_city_segregation_$sorter.dta", replace