diff --git a/cenpy/moe/replicate_table_utils.py b/cenpy/moe/replicate_table_utils.py index 723abec..a4f8a3f 100644 --- a/cenpy/moe/replicate_table_utils.py +++ b/cenpy/moe/replicate_table_utils.py @@ -96,6 +96,11 @@ def read_replicate_file(fname): table = table.drop(["TBLID", "NAME", "ORDER", "CME", "TITLE"], axis=1) table = table.pivot(index="GEOID", columns="variable") table.columns.names = ["categories", "variables"] + # Standardize the names of the columns because the ACB's 2014 tables have + # lowercase titles while others are uppercase. + table = table.rename(columns = {"estimate":"ESTIMATE", + "moe": "MOE", + "se": "SE"}) return table @@ -472,9 +477,9 @@ def apply_func(func, data, params={}): Pandas 81 column dataframe, where the first column is the estimates and the remaining columns are the replicates. """ - estimates = func(data.estimate, **params) + estimates = func(data.ESTIMATE, **params) # subset just the replicates - replicates = data.drop(["estimate", "moe", "SE"], axis=1, level=0) + replicates = data.drop(["ESTIMATE", "MOE", "SE"], axis=1, level=0) # clean out unused column names replicates.columns = replicates.columns.remove_unused_levels() # apply the user function to each replicate @@ -488,7 +493,7 @@ def apply_func(func, data, params={}): ] rep_results = pd.concat(rep_results, axis=1, keys=replicates.columns.levels[0]) # cleanup - rep_results["estimate"] = estimates + rep_results["ESTIMATE"] = estimates rep_results = rep_results.replace([np.inf, -np.inf], 0) # per census documentation return rep_results diff --git a/cenpy/products.py b/cenpy/products.py index a0030db..c750066 100644 --- a/cenpy/products.py +++ b/cenpy/products.py @@ -1,7 +1,10 @@ +from .utilities import _replace_missing +from .utilities import _fuzzy_match +from .utilities import _coerce +from .utilities import _can_int from .remote import APIConnection from .explorer import fips_table as _ft from shapely import geometry -from fuzzywuzzy import fuzz from warnings import warn 
import geopandas import pandas @@ -16,8 +19,6 @@ __all__ = ["Decennial2010", "ACS"] -_ACS_MISSING = (-999999999, -888888888, -666666666, -555555555, -333333333, -222222222) - class _Product(object): """The fundamental building block to make pre-configured Census Products, like ACS or Decennial2010.""" @@ -852,7 +853,7 @@ def tables(self): result = stems.drop("GEO", axis=0, errors="ignore") self._stems = result # keep around the main tables only if they're not crosstabs (ending in alphanumeric) - self._tables = result.loc[[ix for ix in result.index if _can_int(ix[-1])]] + self._tables = result.loc[[ix for ix in result.index if _can_int(ix[-1])]] return self._tables @property @@ -882,129 +883,3 @@ def crosstab_tables(self): ] return self._crosstabs - -############# - # UTILITIES # -############# - - -def _fuzzy_match(matchtarget, matchlist, return_table=False): - """ - Conduct a fuzzy match with matchtarget, within the list of possible match candidates in matchlist. - - Parameters - --------- - matchtarget : str - a string to be matched to a set of possible candidates - matchlist : list of str - a list (or iterable) containing strings we are interested in matching - return_table: bool - whether to return the full table of scored candidates, or to return only the single - best match. If False (the default), only the best match is returned. - - Notes - ----- - consult the docstring for Product.check_match for more information on how the actual matching - algorithm works. - """ - split = matchtarget.split(",") - if len(split) == 2: - target, state = split - elif len(split) == 1: - target = split[0] - else: - raise AssertionError( - "Uncertain place identifier {}. 
The place identifier should " - 'look something like "placename, state" or, for larger areas, ' - "like Combined Statistical Areas or Metropolitan Statistical Areas," - "placename1-placename2, state1-state2-state3".format(target) - ) - - table = pandas.DataFrame({"target": matchlist}) - table["score"] = table.target.apply( - lambda x: fuzz.partial_ratio(target.strip().lower(), x.lower()) - ) - if len(split) == 1: - if (table.score == table.score.max()).sum() > 1: - ixmax, rowmax = _break_ties(matchtarget, table) - else: - ixmax = table.score.idxmax() - rowmax = table.loc[ixmax] - if return_table: - return rowmax, table.sort_values("score") - return rowmax - - in_state = table.target.str.lower().str.endswith(state.strip().lower()) - - assert any(in_state), ( - "State {} is not found from place {}. " - "Should be a standard Census abbreviation, like" - " CA, AZ, NC, or PR".format(state, matchtarget) - ) - table = table[in_state] - if (table.score == table.score.max()).sum() > 1: - ixmax, rowmax = _break_ties(matchtarget, table) - else: - ixmax = table.score.idxmax() - rowmax = table.loc[ixmax] - if return_table: - return rowmax, table.sort_values("score") - return rowmax - - -def _coerce(column, kind): - """ - Converty type of column to kind, or keep column unchanged - if that conversion fails. - """ - try: - return column.astype(kind) - except ValueError: - return column - - -def _replace_missing(column, missings=_ACS_MISSING): - """ - replace ACS missing values using numpy.nan. - """ - for val in _ACS_MISSING: - column.replace(val, numpy.nan, inplace=True) - return column - - -def _break_ties(matchtarget, table): - """ - break ties in the fuzzy matching algorithm using a second scoring method - which prioritizes full string matches over substring matches. 
- """ - split = matchtarget.split(",") - if len(split) == 2: - target, state = split - else: - target = split[0] - table["score2"] = table.target.apply( - lambda x: fuzz.ratio(target.strip().lower(), x.lower()) - ) - among_winners = table[table.score == table.score.max()] - double_winners = among_winners[among_winners.score2 == among_winners.score2.max()] - if double_winners.shape[0] > 1: - ixmax = double_winners.score2.idxmax() - ixmax_row = double_winners.loc[ixmax] - warn( - "Cannot disambiguate placename {}. Picking the shortest, best " - "matched placename, {}, from {}".format( - matchtarget, ixmax_row.target, ", ".join(double_winners.target.tolist()) - ) - ) - return ixmax, ixmax_row - ixmax = double_winners.score2.idxmax() - return ixmax, double_winners.loc[ixmax] - - -def _can_int(char): - """check if a character can be turned into an integer""" - try: - int(char) - return True - except ValueError: - return False diff --git a/cenpy/remote.py b/cenpy/remote.py index 4371735..314b0c5 100644 --- a/cenpy/remote.py +++ b/cenpy/remote.py @@ -6,6 +6,7 @@ from . 
import tiger as tig import math from six import iteritems, PY3 +from .utilities import _coerce if PY3: unicode = str @@ -220,7 +221,7 @@ def query(self, cols=None, geo_unit="", geo_filter={}, apikey="", **kwargs): df = pd.DataFrame().from_records(json_content[1:], columns=json_content[0]) assert all([col in df.columns for col in cols]) if convert_numeric: - df = df.infer_objects() + df[cols] = _coerce(df[cols], int) if index is not "": df.index = df[index] return df diff --git a/cenpy/tests/test_utilities.py b/cenpy/tests/test_utilities.py new file mode 100644 index 0000000..9e660b8 --- /dev/null +++ b/cenpy/tests/test_utilities.py @@ -0,0 +1,55 @@ +import unittest +import pandas +import numpy +from cenpy.utilities import _coerce as coerce +from cenpy.utilities import _replace_missing as replace_missing + +class TestUtilities(unittest.TestCase): + + def test_coerce(self): + # Make sure coerce works on Series and doesn't change them + ser_orig = pandas.Series([3,4,5]) + ser_floats = coerce(ser_orig, cast_to = numpy.float64) + self.assertFalse(ser_orig.equals(ser_floats)) + + # Make sure coerce changes what columns it can and doesn't alter + # original data + df_orig = pandas.DataFrame({"ints": [1,2,3], + "floats": [0.1, 3.79, 14.9], + "strings": ["fst", "sec", "thd"]}) + df_floats = coerce(df_orig, cast_to = numpy.float64) + # Correct types of columns after coercion: + float_dtypes = pandas.Series(["float64", "float64", "object"], + index = ["ints", "floats", "strings"]) + # Make sure that the coerced dtypes are as expected + self.assertFalse(df_orig.equals(df_floats)) + self.assertTrue(float_dtypes.equals(df_floats.dtypes)) + + # Cast castable columns into strings - + # Confusingly enough, pandas calls them "objects" + df_objects = coerce(df_orig, cast_to = str) + object_dtypes = pandas.Series(["object", "object", "object"], + index = ["ints", "floats", "strings"]) + self.assertTrue(object_dtypes.equals(df_objects.dtypes)) + + # Make sure an error gets raised if a 
non-Series/DataFrame object is used + arr = numpy.zeros((2,2)) + self.assertRaises(TypeError, coerce, arr) + + + def test_replace_missing(self): + df_orig = pandas.DataFrame({"ints": [-888888888,2,3], + "floats": [-555555555, 3.79, -333333333]}) + df_replaced = replace_missing(df_orig) + # Correct output after replacing missing values + df_correct = pandas.DataFrame({"ints": [numpy.nan,2,3], + "floats": [numpy.nan, 3.79, numpy.nan]}) + self.assertTrue(df_replaced.equals(df_correct)) + + # Make sure an error is raised if non-Series/DataFrame types are used + arr = numpy.zeros((2,2)) + self.assertRaises(TypeError, replace_missing, arr) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/cenpy/tiger.py b/cenpy/tiger.py index 8dfa04d..be5975c 100644 --- a/cenpy/tiger.py +++ b/cenpy/tiger.py @@ -262,9 +262,9 @@ def query(self, **kwargs): """ layer_result = kwargs.pop("layer", None) if isinstance(layer_result, str): - from .products import _fuzzy_match + from .utilities import _fuzzy_match as fuzzy_match - layer_result = _fuzzy_match( + layer_result = fuzzy_match( layer_result, [f.__repr__() for f in self.layers] ).index if layer_result is None: diff --git a/cenpy/utilities.py b/cenpy/utilities.py new file mode 100644 index 0000000..82845f5 --- /dev/null +++ b/cenpy/utilities.py @@ -0,0 +1,166 @@ +import pandas +import numpy +from fuzzywuzzy import fuzz +from warnings import warn + + +def _fuzzy_match(matchtarget, matchlist, return_table=False): + """ + Conduct a fuzzy match with matchtarget, within the list of possible match candidates in matchlist. + + Parameters + --------- + matchtarget : str + a string to be matched to a set of possible candidates + matchlist : list of str + a list (or iterable) containing strings we are interested in matching + return_table: bool + whether to return the full table of scored candidates, or to return only the single + best match. If False (the default), only the best match is returned. 
+ + Notes + ----- + consult the docstring for Product.check_match for more information on how the actual matching + algorithm works. + """ + split = matchtarget.split(",") + if len(split) == 2: + target, state = split + elif len(split) == 1: + target = split[0] + else: + raise AssertionError( + "Uncertain place identifier {}. The place identifier should " + 'look something like "placename, state" or, for larger areas, ' + "like Combined Statistical Areas or Metropolitan Statistical Areas," + "placename1-placename2, state1-state2-state3".format(target) + ) + + table = pandas.DataFrame({"target": matchlist}) + table["score"] = table.target.apply( + lambda x: fuzz.partial_ratio(target.strip().lower(), x.lower()) + ) + if len(split) == 1: + if (table.score == table.score.max()).sum() > 1: + ixmax, rowmax = _break_ties(matchtarget, table) + else: + ixmax = table.score.idxmax() + rowmax = table.loc[ixmax] + if return_table: + return rowmax, table.sort_values("score") + return rowmax + + in_state = table.target.str.lower().str.endswith(state.strip().lower()) + + assert any(in_state), ( + "State {} is not found from place {}. " + "Should be a standard Census abbreviation, like" + " CA, AZ, NC, or PR".format(state, matchtarget) + ) + table = table[in_state] + if (table.score == table.score.max()).sum() > 1: + ixmax, rowmax = _break_ties(matchtarget, table) + else: + ixmax = table.score.idxmax() + rowmax = table.loc[ixmax] + if return_table: + return rowmax, table.sort_values("score") + return rowmax + + +def _coerce(data, cast_to = numpy.float64): + """ + Convert each column of data to cast_to. If a conversion of a column fails, move onto + the next column. + + Parameters + ---------- + data : DataFrame or Series + + cast_to : type, default numpy.float64 + One of: numpy.int8, numpy.float64, str, int, etc.. 
+ + Returns + ------- + data with columns casted to specified type + """ + if isinstance(data, pandas.DataFrame): + data = data.copy() # Don't operate on user's data + for column in data.columns: + data[column] = _coerce(data[column], cast_to = cast_to) + return data + elif isinstance(data, pandas.Series): + try: + return data.astype(cast_to) + except (ValueError, TypeError): + return data + else: + raise TypeError("_coerce is designed to only work" + "with pandas DataFrames and Series") + + +def _replace_missing(data): + + """ + Replace ACS missing values using numpy.nan. + + Parameters + ---------- + data : DataFrame or Series + + Returns + ------- + data with missing values changed to numpy.nans + """ + + acs_missing = [-999999999, -888888888, -666666666, + -555555555, -333333333, -222222222] + + if isinstance(data, pandas.DataFrame): + data = data.copy() + for column in data.columns: + data[column] = _replace_missing(data[column]) + return data + elif isinstance(data, pandas.Series): + return data.replace(acs_missing, numpy.nan) + else: + raise TypeError("_replace_missing is designed to only work" + "with pandas DataFrames and Series") + + +def _break_ties(matchtarget, table): + """ + break ties in the fuzzy matching algorithm using a second scoring method + which prioritizes full string matches over substring matches. + """ + split = matchtarget.split(",") + if len(split) == 2: + target, state = split + else: + target = split[0] + table["score2"] = table.target.apply( + lambda x: fuzz.ratio(target.strip().lower(), x.lower()) + ) + among_winners = table[table.score == table.score.max()] + double_winners = among_winners[among_winners.score2 == among_winners.score2.max()] + if double_winners.shape[0] > 1: + ixmax = double_winners.score2.idxmax() + ixmax_row = double_winners.loc[ixmax] + warn( + "Cannot disambiguate placename {}. 
Picking the shortest, best " + "matched placename, {}, from {}".format( + matchtarget, ixmax_row.target, ", ".join(double_winners.target.tolist()) + ) + ) + return ixmax, ixmax_row + ixmax = double_winners.score2.idxmax() + return ixmax, double_winners.loc[ixmax] + + +def _can_int(char): + """check if a character can be turned into an integer""" + try: + int(char) + return True + except ValueError: + return False