changed it to build dataset with a few more variables (branch: master)
author: Benjamin Mako Hill <mako@atdot.cc>
Mon, 10 Apr 2023 22:54:37 +0000 (15:54 -0700)
committer: Benjamin Mako Hill <mako@atdot.cc>
Mon, 10 Apr 2023 22:55:01 +0000 (15:55 -0700)
BabyNames.ipynb
ssadata.py

index 775b9a135d75906347609167a1ec8de59b7fc3d1..8a8247652dce981a2327dc658122c20df6101e9d 100644 (file)
@@ -37,7 +37,7 @@
    "source": [
     "for name in boys.keys():\n",
     "    if name in girls.keys():\n",
    "source": [
     "for name in boys.keys():\n",
     "    if name in girls.keys():\n",
-    "        print(name)\n"
+    "        print(name)"
    ]
   },
   {
    ]
   },
   {
     "    if 'queen' == name:\n",
     "        print(name + \" \" + str(girls[name]))"
    ]
     "    if 'queen' == name:\n",
     "        print(name + \" \" + str(girls[name]))"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Now practice your new skills.  Here are some examples of questions you might ask. If you think of any other questions that you think it might be interesting to answer then you should also try to answer those as well!\n",
-    "\n",
-    "\n",
-    "Search for your own name. Are there both boys and girls that have your name? Is it more popular for one group than for the other?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "\n",
-    "Are there more boys' names or girls' names? What about for particular first letters? What about for ''every'' first letter?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "What is the longest name in the dataset?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "What is the most common name and how often does it occur?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "What is the least common name and how often does it occur? Does that concern you?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "How many boys and girls are described in the dataset (i.e., how many boys and girls born in 2018 have names given to at least four others)?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "How many boys' names are also girls' names? How many girls' names are also boys' names?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "How many names are subsets of other names?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "\n",
-    "What is the most popular girls' name that is also a boys' name?\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Write a function that takes a prefix as input and prints the number of boys and girls with that prefix (e.g., get_names(\"m\") would list all names that start with \"m\" and get_names(\"ma\") would only list those that start with \"ma\")."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   }
  ],
  "metadata": {
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-
    "version": "3.9.2"
   }
  },
    "version": "3.9.2"
   }
  },
index 6fabb39a25a97931059e16d9845b53ef3efa8d11..35fabe9435cc9ed292a26ddeb2af87de350cad70 100644 (file)
@@ -1,13 +1,34 @@
-NAMES_LIST = "yob2018.txt"
+import glob
+import re
 
 
-boys = {}
-girls = {}
+def import_yob_file(filename):
+    boys = {}
+    girls = {}
 
 
-for line in open(NAMES_LIST, 'r').readlines():
-    name, gender, count = line.strip().split(",")
-    count = int(count)
+    with open(filename, "r") as f:
+        for line in f.readlines():
+            name, gender, count = line.strip().split(",")
+            count = int(count)
+            if gender == "F":
+                girls[name.lower()] = count
+            elif gender == "M":
+                boys[name.lower()] = count
+                
+    return((boys, girls))
 
 
-    if gender == "F":
-        girls[name.lower()] = count
-    elif gender == "M":
-        boys[name.lower()] = count
+years = {}
+for filename in glob.glob('yob*.txt'):
+    year = re.match(r'yob(\d{4})\.txt$', filename).group(1)
+    tmp_boys, tmp_girls = import_yob_file(filename)
+    years[year] = {'girls' : tmp_girls,
+                   'boys' : tmp_boys}
+
+## resort by year
+years = dict(sorted(years.items()))
+
+girls = years["2021"]["girls"]
+boys = years["2021"]["boys"]
+
+
+    
+    

Benjamin Mako Hill || Want to submit a patch?