# `pandas` is a popular library for manipulating data and performing statistical analyses

### First, a few preliminary steps are needed: import the library and load some data

In [2]:
# Import pandas under the alias "pd" -- this is the conventional short name for the library,
# so every pandas function below is written as pd.<function>.
import pandas as pd

# Now, let's import our first dataset! read_csv reads the CSV file into a data frame.
# The path is relative, so it is looked up starting from the current working directory.
gdp_data = pd.read_csv("data/gapminder_gdp_oceania.csv")

In [17]:
# In the output, the columns are variables and the rows are observations.
# The table is too wide to fit on one line, so it is printed in several blocks:
# a trailing backslash "\" means the same rows continue with more columns in the next block.
print(gdp_data)

       country  gdpPercap_1952  gdpPercap_1957  gdpPercap_1962  \
0    Australia     10039.59564     10949.64959     12217.22686   
1  New Zealand     10556.57566     12247.39532     13175.67800   

   gdpPercap_1967  gdpPercap_1972  gdpPercap_1977  gdpPercap_1982  \
0     14526.12465     16788.62948     18334.19751     19477.00928   
1     14463.91893     16046.03728     16233.71770     17632.41040   

   gdpPercap_1987  gdpPercap_1992  gdpPercap_1997  gdpPercap_2002  \
0     21888.88903     23424.76683     26997.93657     30687.75473   
1     19007.19129     18363.32494     21050.41377     23189.80135   

   gdpPercap_2007  
0     34435.36744  
1     25185.00911  


### Be careful with the file path! Check where you are in the file system, then find where you stored the data

In [7]:
# `os` is part of Python's standard library; its getcwd function ("get current working
# directory") is one way to check where in the file system you are.
from os import getcwd

In [8]:
# Show the current working directory; relative paths such as "data/..." start from here.
getcwd()

'/Users/vaniawang/gDrive_main/grad-school/PhD-UCSB/teaching/swc-python2019'

# Let's organize this data a bit first...

### Let's print out the dataset and take a look again:

In [18]:
# Note the leftmost labels (0, 1): the rows are numbered, and "country" is just another column.
print(gdp_data)

       country  gdpPercap_1952  gdpPercap_1957  gdpPercap_1962  \
0    Australia     10039.59564     10949.64959     12217.22686   
1  New Zealand     10556.57566     12247.39532     13175.67800   

   gdpPercap_1967  gdpPercap_1972  gdpPercap_1977  gdpPercap_1982  \
0     14526.12465     16788.62948     18334.19751     19477.00928   
1     14463.91893     16046.03728     16233.71770     17632.41040   

   gdpPercap_1987  gdpPercap_1992  gdpPercap_1997  gdpPercap_2002  \
0     21888.88903     23424.76683     26997.93657     30687.75473   
1     19007.19129     18363.32494     21050.41377     23189.80135   

   gdpPercap_2007  
0     34435.36744  
1     25185.00911  


### Take a look at the first column: these are supposed to be row headings. Let's re-import the data to account for this:

In [19]:
# Re-read the same CSV, this time telling pandas to use the "country" column
# as the row labels (the index) instead of numbering the rows 0, 1, ...
gdp_data = pd.read_csv(
    "data/gapminder_gdp_oceania.csv",
    index_col="country",
)

In [20]:
# The rows are now labelled by country name rather than by 0 and 1.
print(gdp_data)

             gdpPercap_1952  gdpPercap_1957  gdpPercap_1962  gdpPercap_1967  \
country                                                                       
Australia       10039.59564     10949.64959     12217.22686     14526.12465   
New Zealand     10556.57566     12247.39532     13175.67800     14463.91893   

             gdpPercap_1972  gdpPercap_1977  gdpPercap_1982  gdpPercap_1987  \
country                                                                       
Australia       16788.62948     18334.19751     19477.00928     21888.88903   
New Zealand     16046.03728     16233.71770     17632.41040     19007.19129   

             gdpPercap_1992  gdpPercap_1997  gdpPercap_2002  gdpPercap_2007  
country                                                                      
Australia       23424.76683     26997.93657     30687.75473     34435.36744  
New Zealand     18363.32494     21050.41377     23189.80135     25185.00911  


### Here's some overall info about the data:

In [22]:
# info() prints a summary of the data frame: the index, each column with its
# non-null count and dtype, and the memory usage.
gdp_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, Australia to New Zealand
Data columns (total 12 columns):
gdpPercap_1952    2 non-null float64
gdpPercap_1957    2 non-null float64
gdpPercap_1962    2 non-null float64
gdpPercap_1967    2 non-null float64
gdpPercap_1972    2 non-null float64
gdpPercap_1977    2 non-null float64
gdpPercap_1982    2 non-null float64
gdpPercap_1987    2 non-null float64
gdpPercap_1992    2 non-null float64
gdpPercap_1997    2 non-null float64
gdpPercap_2002    2 non-null float64
gdpPercap_2007    2 non-null float64
dtypes: float64(12)
memory usage: 208.0+ bytes


### So, this data type is a _data frame_. Next, let's talk about basic properties of a data frame.

# Properties of data frames

### Let's talk more about data frame columns

In [24]:
# Note that the expression below has no parentheses (gdp_data.columns, not gdp_data.columns()).
# Why? `columns` is an attribute (a property) of gdp_data, not a method: we are only looking up
# information the data frame already holds, not asking it to perform an operation.
gdp_data.columns

Index(['gdpPercap_1952', 'gdpPercap_1957', 'gdpPercap_1962', 'gdpPercap_1967',
       'gdpPercap_1972', 'gdpPercap_1977', 'gdpPercap_1982', 'gdpPercap_1987',
       'gdpPercap_1992', 'gdpPercap_1997', 'gdpPercap_2002', 'gdpPercap_2007'],
      dtype='object')

### Transposing: sometimes we want to treat columns as rows, and vice versa

In [26]:
# transpose() swaps rows and columns (`.T` is the shorthand spelling of the same thing):
# each country becomes a column and each gdpPercap_<year> becomes a row.
print(gdp_data.transpose())

country           Australia  New Zealand
gdpPercap_1952  10039.59564  10556.57566
gdpPercap_1957  10949.64959  12247.39532
gdpPercap_1962  12217.22686  13175.67800
gdpPercap_1967  14526.12465  14463.91893
gdpPercap_1972  16788.62948  16046.03728
gdpPercap_1977  18334.19751  16233.71770
gdpPercap_1982  19477.00928  17632.41040
gdpPercap_1987  21888.88903  19007.19129
gdpPercap_1992  23424.76683  18363.32494
gdpPercap_1997  26997.93657  21050.41377
gdpPercap_2002  30687.75473  23189.80135
gdpPercap_2007  34435.36744  25185.00911


### What about some summary statistics about the data? Let's use `[DataFrame].describe()`

In [27]:
# describe() summarises every column: count, mean, std, min, the 25%/50%/75% quantiles, and max.
print(gdp_data.describe())

       gdpPercap_1952  gdpPercap_1957  gdpPercap_1962  gdpPercap_1967  \
count        2.000000        2.000000        2.000000        2.000000   
mean     10298.085650    11598.522455    12696.452430    14495.021790   
std        365.560078      917.644806      677.727301       43.986086   
min      10039.595640    10949.649590    12217.226860    14463.918930   
25%      10168.840645    11274.086022    12456.839645    14479.470360   
50%      10298.085650    11598.522455    12696.452430    14495.021790   
75%      10427.330655    11922.958888    12936.065215    14510.573220   
max      10556.575660    12247.395320    13175.678000    14526.124650   

       gdpPercap_1972  gdpPercap_1977  gdpPercap_1982  gdpPercap_1987  \
count         2.00000        2.000000        2.000000        2.000000   
mean      16417.33338    17283.957605    18554.709840    20448.040160   
std         525.09198     1485.263517     1304.328377     2037.668013   
min       16046.03728    16233.717700    17632.410

# _Challenge 1_

In [4]:
# Challenge 1: load the Americas GDP data, using the "country" column as the row labels.
americas = pd.read_csv(
    "data/gapminder_gdp_americas.csv",
    index_col="country",
)

# _Challenge 2_

In [13]:
# Challenge 2a: head(3) keeps only the first three rows (here: the first three countries).
print(americas.head(3))

          continent  gdpPercap_1952  gdpPercap_1957  gdpPercap_1962  \
country                                                               
Argentina  Americas     5911.315053     6856.856212     7133.166023   
Bolivia    Americas     2677.326347     2127.686326     2180.972546   
Brazil     Americas     2108.944355     2487.365989     3336.585802   

           gdpPercap_1967  gdpPercap_1972  gdpPercap_1977  gdpPercap_1982  \
country                                                                     
Argentina     8052.953021     9443.038526    10079.026740     8997.897412   
Bolivia       2586.886053     2980.331339     3548.097832     3156.510452   
Brazil        3429.864357     4985.711467     6660.118654     7030.835878   

           gdpPercap_1987  gdpPercap_1992  gdpPercap_1997  gdpPercap_2002  \
country                                                                     
Argentina     9139.671389     9308.418710    10967.281950     8797.640716   
Bolivia       2753.691490  

In [12]:
# Challenge 2b: to see the last three *columns*, transpose first so that columns become rows
# (`.T` is the shorthand for transpose()), then take the last three rows with tail(3).
print(americas.transpose().tail(3))

country        Argentina  Bolivia   Brazil   Canada    Chile Colombia  \
gdpPercap_1997   10967.3  3326.14  7957.98  28954.9  10118.1  6117.36   
gdpPercap_2002   8797.64  3413.26  8131.21    33329  10778.8  5755.26   
gdpPercap_2007   12779.4  3822.14   9065.8  36319.2  13171.6  7006.58   

country        Costa Rica     Cuba Dominican Republic  Ecuador    ...     \
gdpPercap_1997    6677.05  5431.99             3614.1  7429.46    ...      
gdpPercap_2002    7723.45  6340.65            4563.81  5773.04    ...      
gdpPercap_2007    9645.06   8948.1            6025.37  6873.26    ...      

country          Mexico Nicaragua   Panama Paraguay     Peru Puerto Rico  \
gdpPercap_1997   9767.3   2253.02  7113.69   4247.4  5838.35     16999.4   
gdpPercap_2002  10742.4   2474.55  7356.03  3783.67  5909.02     18855.6   
gdpPercap_2007  11977.6   2749.32  9809.19  4172.84  7408.91     19328.7   

country        Trinidad and Tobago United States  Uruguay Venezuela  
gdpPercap_1997             

# _Challenge 3_

In [14]:
# Challenge 3: write the transposed table (`.T` / transpose()) out as a CSV file.
# The path is relative, so the file is created in the current working directory.
americas.transpose().to_csv("americas_transposed.csv")