Skip to content

Commit

Permalink
Speed up make category uniques
Browse files: browse the repository at this point in the history
  • Loading branch information
ddxv committed Oct 14, 2024
1 parent 418d58e commit 83a37cc
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 84 deletions.
118 changes: 36 additions & 82 deletions backend/api_app/controllers/companies.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ def get_overviews(category: str | None = None) -> CompaniesOverview:

overview_df = (
overview_df.groupby(
["company_name", "company_domain", "store", "tag_source"], dropna=False,
["company_name", "company_domain", "store", "tag_source"],
dropna=False,
)["app_count"]
.sum()
.reset_index()
Expand Down Expand Up @@ -251,91 +252,44 @@ def transform_group(group: pd.DataFrame) -> dict:
def make_category_uniques(df: pd.DataFrame) -> CategoryOverview:
"""Make category sums for overview."""
overview = CategoryOverview()
conditions = {
"sdk_ios": (df["store"].str.contains("Apple")) & (df["tag_source"] == "sdk"),
"sdk_android": (df["store"].str.contains("Google"))
& (df["tag_source"] == "sdk"),
"adstxt_ios": (df["store"].str.contains("Apple"))
& (df["tag_source"] == "app_ads"),
"adstxt_android": (df["store"].str.contains("Google"))
& (df["tag_source"] == "app_ads"),
}

# Calculate sums for all conditions in one go
results = {
key: df.loc[condition, "company_domain"].nunique()
for key, condition in conditions.items()
# Precompute boolean masks
is_apple = df["store"].str.contains("Apple")
is_google = df["store"].str.contains("Google")
is_sdk = df["tag_source"] == "sdk"
is_app_ads = df["tag_source"] == "app_ads"

# Function to calculate unique counts
def get_unique_counts(mask: pd.Series) -> int:
return df.loc[mask, "company_domain"].nunique()

# Calculate overall stats
overall_stats = {
"total_apps": df["company_domain"].nunique(),
"sdk_ios_total_apps": get_unique_counts(is_apple & is_sdk),
"sdk_android_total_apps": get_unique_counts(is_google & is_sdk),
"adstxt_ios_total_apps": get_unique_counts(is_apple & is_app_ads),
"adstxt_android_total_apps": get_unique_counts(is_google & is_app_ads),
}

# Unpack results
(
sdk_ios_total_apps,
sdk_android_total_apps,
adstxt_ios_total_apps,
adstxt_android_total_apps,
) = (
results["sdk_ios"],
results["sdk_android"],
results["adstxt_ios"],
results["adstxt_android"],
)

total_apps = df["company_domain"].nunique()

overview.update_stats(
"all",
total_apps=total_apps,
adstxt_ios_total_apps=adstxt_ios_total_apps,
adstxt_android_total_apps=adstxt_android_total_apps,
sdk_ios_total_apps=sdk_ios_total_apps,
sdk_android_total_apps=sdk_android_total_apps,
)
cats = df.app_category.unique().tolist()
for cat in cats:
conditions = {
"sdk_ios": (df["store"].str.contains("Apple"))
& (df["tag_source"] == "sdk")
& (df["app_category"] == cat),
"sdk_android": (df["store"].str.contains("Google"))
& (df["tag_source"] == "sdk")
& (df["app_category"] == cat),
"adstxt_ios": (df["store"].str.contains("Apple"))
& (df["tag_source"] == "app_ads")
& (df["app_category"] == cat),
"adstxt_android": (df["store"].str.contains("Google"))
& (df["tag_source"] == "app_ads")
& (df["app_category"] == cat),
}

# Calculate sums for all conditions in one go
results = {
key: df.loc[condition, "company_domain"].nunique()
for key, condition in conditions.items()
overview.update_stats("all", **overall_stats)

# Calculate stats for each category
categories = df["app_category"].unique()
for cat in categories:
cat_mask = df["app_category"] == cat
cat_stats = {
"total_apps": get_unique_counts(cat_mask),
"sdk_ios_total_apps": get_unique_counts(cat_mask & is_apple & is_sdk),
"sdk_android_total_apps": get_unique_counts(cat_mask & is_google & is_sdk),
"adstxt_ios_total_apps": get_unique_counts(
cat_mask & is_apple & is_app_ads,
),
"adstxt_android_total_apps": get_unique_counts(
cat_mask & is_google & is_app_ads,
),
}
overview.update_stats(cat, **cat_stats)

# Unpack results
(
sdk_ios_total_apps,
sdk_android_total_apps,
adstxt_ios_total_apps,
adstxt_android_total_apps,
) = (
results["sdk_ios"],
results["sdk_android"],
results["adstxt_ios"],
results["adstxt_android"],
)

total_apps = df[df["app_category"] == cat]["company_domain"].nunique()

overview.update_stats(
cat,
total_apps=total_apps,
adstxt_ios_total_apps=adstxt_ios_total_apps,
adstxt_android_total_apps=adstxt_android_total_apps,
sdk_ios_total_apps=sdk_ios_total_apps,
sdk_android_total_apps=sdk_android_total_apps,
)
return overview


Expand Down
3 changes: 1 addition & 2 deletions backend/api_app/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Data models for APIs."""

from dataclasses import dataclass, field
from typing import Any


@dataclass
Expand Down Expand Up @@ -220,7 +219,7 @@ def add_category(self, category: str) -> None:
if category not in self.categories:
self.categories[category] = CategoryAppStats()

def update_stats(self, category: str, **kwargs: dict[str, Any]) -> None:
def update_stats(self, category: str, **kwargs: int) -> None:
"""Update the stats for a category."""
if category not in self.categories:
self.add_category(category)
Expand Down

0 comments on commit 83a37cc

Please sign in to comment.