# libraries
import pandas as pd
import numpy as np
import psutil
import gc
import pyarrow.parquet as pq
import pyarrow as pa
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200
pd.options.display.max_info_columns = 300
Chunks in Pandas, PyArrow, and Parquet - Amex Competition

def available_memory_gb():
return psutil.virtual_memory().available / (1024**3)Introduction - The AMEX Default Prediction Competition
The AMEX default prediction competition on Kaggle is a binary classification problem where the goal is to predict whether a credit card holder will default on their payment within 120 days after their latest credit card statement, based on data from an 18-month performance window.
The first step in any data science project is to load the data and understand its structure. The problem here is that the dataset is 50GB in size and it is not possible to load it all into memory at once (at least not in my machine). So here is where headaches start to appear.
In this notebook, I will show how to load the data in chunks using Pandas and PyArrow, and how to save it in Parquet format. This way, we can load the data in chunks and avoid memory issues.
You don’t need to load the data all at once
The first thing to understand is that you don’t need to load the data all at once. You can have a first glance to the data by loading only a small sample of it. This will give you an idea of the data structure and help you to decide what to do next.
PATH = '/home/jmanu/git/amex_competition/train_data.csv'
data = pd.read_csv(PATH, nrows=35000)
data.head(5)| customer_ID | S_2 | P_2 | D_39 | B_1 | B_2 | R_1 | S_3 | D_41 | B_3 | D_42 | D_43 | D_44 | B_4 | D_45 | B_5 | R_2 | D_46 | D_47 | D_48 | D_49 | B_6 | B_7 | B_8 | D_50 | D_51 | B_9 | R_3 | D_52 | P_3 | B_10 | D_53 | S_5 | B_11 | S_6 | D_54 | R_4 | S_7 | B_12 | S_8 | D_55 | D_56 | B_13 | R_5 | D_58 | S_9 | B_14 | D_59 | D_60 | D_61 | B_15 | S_11 | D_62 | D_63 | D_64 | D_65 | B_16 | B_17 | B_18 | B_19 | D_66 | B_20 | D_68 | S_12 | R_6 | S_13 | B_21 | D_69 | B_22 | D_70 | D_71 | D_72 | S_15 | B_23 | D_73 | P_4 | D_74 | D_75 | D_76 | B_24 | R_7 | D_77 | B_25 | B_26 | D_78 | D_79 | R_8 | R_9 | S_16 | D_80 | R_10 | R_11 | B_27 | D_81 | D_82 | S_17 | R_12 | B_28 | R_13 | D_83 | R_14 | R_15 | D_84 | R_16 | B_29 | B_30 | S_18 | D_86 | D_87 | R_17 | R_18 | D_88 | B_31 | S_19 | R_19 | B_32 | S_20 | R_20 | R_21 | B_33 | D_89 | R_22 | R_23 | D_91 | D_92 | D_93 | D_94 | R_24 | R_25 | D_96 | S_22 | S_23 | S_24 | S_25 | S_26 | D_102 | D_103 | D_104 | D_105 | D_106 | D_107 | B_36 | B_37 | R_26 | R_27 | B_38 | D_108 | D_109 | D_110 | D_111 | B_39 | D_112 | B_40 | S_27 | D_113 | D_114 | D_115 | D_116 | D_117 | D_118 | D_119 | D_120 | D_121 | D_122 | D_123 | D_124 | D_125 | D_126 | D_127 | D_128 | D_129 | B_41 | B_42 | D_130 | D_131 | D_132 | D_133 | R_28 | D_134 | D_135 | D_136 | D_137 | D_138 | D_139 | D_140 | D_141 | D_142 | D_143 | D_144 | D_145 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2017-03-09 | 0.938469 | 0.001733 | 0.008724 | 1.006838 | 0.009228 | 0.124035 | 0.008771 | 0.004709 | NaN | NaN | 0.000630 | 0.080986 | 0.708906 | 0.170600 | 0.006204 | 0.358587 | 0.525351 | 0.255736 | NaN | 0.063902 | 0.059416 | 0.006466 | 0.148698 | 1.335856 | 0.008207 | 0.001423 | 0.207334 | 0.736463 | 0.096219 | NaN | 0.023381 | 0.002768 | 0.008322 | 1.001519 | 0.008298 | 0.161345 | 0.148266 | 0.922998 | 0.354596 | 0.152025 | 0.118075 | 0.001882 | 0.158612 | 0.065728 | 0.018385 | 0.063646 | 0.199617 | 0.308233 | 0.016361 | 0.401619 | 0.091071 | CR | O | 0.007126 | 0.007665 | NaN | 0.652984 | 0.008520 | NaN | 0.004730 | 6.0 | 0.272008 | 0.008363 | 0.515222 | 0.002644 | 0.009013 | 0.004808 | 0.008342 | 0.119403 | 0.004802 | 0.108271 | 0.050882 | NaN | 0.007554 | 0.080422 | 0.069067 | NaN | 0.004327 | 0.007562 | NaN | 0.007729 | 0.000272 | 0.001576 | 0.004239 | 0.001434 | NaN | 0.002271 | 0.004061 | 0.007121 | 0.002456 | 0.002310 | 0.003532 | 0.506612 | 0.008033 | 1.009825 | 0.084683 | 0.003820 | 0.007043 | 0.000438 | 0.006452 | 0.000830 | 0.005055 | NaN | 0.0 | 0.005720 | 0.007084 | NaN | 0.000198 | 0.008907 | NaN | 1 | 0.002537 | 0.005177 | 0.006626 | 0.009705 | 0.007782 | 0.002450 | 1.001101 | 0.002665 | 0.007479 | 0.006893 | 1.503673 | 1.006133 | 0.003569 | 0.008871 | 0.003950 | 0.003647 | 0.004950 | 0.894090 | 0.135561 | 0.911191 | 0.974539 | 0.001243 | 0.766688 | 1.008691 | 1.004587 | 0.893734 | NaN | 0.670041 | 0.009968 | 0.004572 | NaN | 1.008949 | 2.0 | NaN | 0.004326 | NaN | NaN | NaN | 1.007336 | 0.210060 | 0.676922 | 0.007871 | 1.0 | 0.238250 | 0.0 | 4.0 | 0.232120 | 0.236266 | 0.0 | 0.702280 | 0.434345 | 0.003057 | 0.686516 | 0.008740 | 1.0 | 1.003319 | 1.007819 | 1.000080 | 0.006805 | NaN | 0.002052 | 0.005972 | NaN | 0.004345 | 0.001535 | NaN | NaN | NaN | NaN | NaN | 0.002427 | 0.003706 | 0.003818 | NaN | 0.000569 | 0.000610 | 0.002674 |
| 1 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2017-04-07 | 0.936665 | 0.005775 | 0.004923 | 1.000653 | 0.006151 | 0.126750 | 0.000798 | 0.002714 | NaN | NaN | 0.002526 | 0.069419 | 0.712795 | 0.113239 | 0.006206 | 0.353630 | 0.521311 | 0.223329 | NaN | 0.065261 | 0.057744 | 0.001614 | 0.149723 | 1.339794 | 0.008373 | 0.001984 | 0.202778 | 0.720886 | 0.099804 | NaN | 0.030599 | 0.002749 | 0.002482 | 1.009033 | 0.005136 | 0.140951 | 0.143530 | 0.919414 | 0.326757 | 0.156201 | 0.118737 | 0.001610 | 0.148459 | 0.093935 | 0.013035 | 0.065501 | 0.151387 | 0.265026 | 0.017688 | 0.406326 | 0.086805 | CR | O | 0.002413 | 0.007148 | NaN | 0.647093 | 0.002238 | NaN | 0.003879 | 6.0 | 0.188970 | 0.004030 | 0.509048 | 0.004193 | 0.007842 | 0.001283 | 0.006524 | 0.140611 | 0.000094 | 0.101018 | 0.040469 | NaN | 0.004832 | 0.081413 | 0.074166 | NaN | 0.004203 | 0.005304 | NaN | 0.001864 | 0.000979 | 0.009896 | 0.007597 | 0.000509 | NaN | 0.009810 | 0.000127 | 0.005966 | 0.000395 | 0.001327 | 0.007773 | 0.500855 | 0.000760 | 1.009461 | 0.081843 | 0.000347 | 0.007789 | 0.004311 | 0.002332 | 0.009469 | 0.003753 | NaN | 0.0 | 0.007584 | 0.006677 | NaN | 0.001142 | 0.005907 | NaN | 1 | 0.008427 | 0.008979 | 0.001854 | 0.009924 | 0.005987 | 0.002247 | 1.006779 | 0.002508 | 0.006827 | 0.002837 | 1.503577 | 1.005791 | 0.000571 | 0.000391 | 0.008351 | 0.008850 | 0.003180 | 0.902135 | 0.136333 | 0.919876 | 0.975624 | 0.004561 | 0.786007 | 1.000084 | 1.004118 | 0.906841 | NaN | 0.668647 | 0.003921 | 0.004654 | NaN | 1.003205 | 2.0 | NaN | 0.008707 | NaN | NaN | NaN | 1.007653 | 0.184093 | 0.822281 | 0.003444 | 1.0 | 0.247217 | 0.0 | 4.0 | 0.243532 | 0.241885 | 0.0 | 0.707017 | 0.430501 | 0.001306 | 0.686414 | 0.000755 | 1.0 | 1.008394 | 1.004333 | 1.008344 | 0.004407 | NaN | 0.001034 | 0.004838 | NaN | 0.007495 | 0.004931 | NaN | NaN | NaN | NaN | NaN | 0.003954 | 0.003167 | 0.005032 | NaN | 0.009576 | 0.005492 | 0.009217 |
| 2 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2017-05-28 | 0.954180 | 0.091505 | 0.021655 | 1.009672 | 0.006815 | 0.123977 | 0.007598 | 0.009423 | NaN | NaN | 0.007605 | 0.068839 | 0.720884 | 0.060492 | 0.003259 | 0.334650 | 0.524568 | 0.189424 | NaN | 0.066982 | 0.056647 | 0.005126 | 0.151955 | 1.337179 | 0.009355 | 0.007426 | 0.206629 | 0.738044 | 0.134073 | NaN | 0.048367 | 0.010077 | 0.000530 | 1.009184 | 0.006961 | 0.112229 | 0.137014 | 1.001977 | 0.304124 | 0.153795 | 0.114534 | 0.006328 | 0.139504 | 0.084757 | 0.056653 | 0.070607 | 0.305883 | 0.212165 | 0.063955 | 0.406768 | 0.094001 | CR | O | 0.001878 | 0.003636 | NaN | 0.645819 | 0.000408 | NaN | 0.004578 | 6.0 | 0.495308 | 0.006838 | 0.679257 | 0.001337 | 0.006025 | 0.009393 | 0.002615 | 0.075868 | 0.007152 | 0.103239 | 0.047454 | NaN | 0.006561 | 0.078891 | 0.076510 | NaN | 0.001782 | 0.001422 | NaN | 0.005419 | 0.006149 | 0.009629 | 0.003094 | 0.008295 | NaN | 0.009362 | 0.000954 | 0.005447 | 0.007345 | 0.007624 | 0.008811 | 0.504606 | 0.004056 | 1.004291 | 0.081954 | 0.002709 | 0.004093 | 0.007139 | 0.008358 | 0.002325 | 0.007381 | NaN | 0.0 | 0.005901 | 0.001185 | NaN | 0.008013 | 0.008882 | NaN | 1 | 0.007327 | 0.002016 | 0.008686 | 0.008446 | 0.007291 | 0.007794 | 1.001014 | 0.009634 | 0.009820 | 0.005080 | 1.503359 | 1.005801 | 0.007425 | 0.009234 | 0.002471 | 0.009769 | 0.005433 | 0.939654 | 0.134938 | 0.958699 | 0.974067 | 0.011736 | 0.806840 | 1.003014 | 1.009285 | 0.928719 | NaN | 0.670901 | 0.001264 | 0.019176 | NaN | 1.000754 | 2.0 | NaN | 0.004092 | NaN | NaN | NaN | 1.004312 | 0.154837 | 0.853498 | 0.003269 | 1.0 | 0.239867 | 0.0 | 4.0 | 0.240768 | 0.239710 | 0.0 | 0.704843 | 0.434409 | 0.003954 | 0.690101 | 0.009617 | 1.0 | 1.009307 | 1.007831 | 1.006878 | 0.003221 | NaN | 0.005681 | 0.005497 | NaN | 0.009227 | 0.009123 | NaN | NaN | NaN | NaN | NaN | 0.003269 | 0.007329 | 0.000427 | NaN | 0.003429 | 0.006986 | 0.002603 |
| 3 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2017-06-13 | 0.960384 | 0.002455 | 0.013683 | 1.002700 | 0.001373 | 0.117169 | 0.000685 | 0.005531 | NaN | NaN | 0.006406 | 0.055630 | 0.723997 | 0.166782 | 0.009918 | 0.323271 | 0.530929 | 0.135586 | NaN | 0.083720 | 0.049253 | 0.001418 | 0.151219 | 1.339909 | 0.006782 | 0.003515 | 0.208214 | 0.741813 | 0.134437 | NaN | 0.030063 | 0.009667 | 0.000783 | 1.007456 | 0.008706 | 0.102838 | 0.129017 | 0.704016 | 0.275055 | 0.155772 | 0.120740 | 0.004980 | 0.138100 | 0.048382 | 0.012498 | 0.065926 | 0.273553 | 0.204300 | 0.022732 | 0.405175 | 0.094854 | CR | O | 0.005899 | 0.005896 | NaN | 0.654358 | 0.005897 | NaN | 0.005207 | 6.0 | 0.508670 | 0.008183 | 0.515282 | 0.008716 | 0.005271 | 0.004554 | 0.002052 | 0.150209 | 0.005364 | 0.206394 | 0.031705 | NaN | 0.009559 | 0.077490 | 0.071547 | NaN | 0.005595 | 0.006363 | NaN | 0.000646 | 0.009193 | 0.008568 | 0.003895 | 0.005153 | NaN | 0.004876 | 0.005665 | 0.001888 | 0.004961 | 0.000034 | 0.004652 | 0.508998 | 0.006969 | 1.004728 | 0.060634 | 0.009982 | 0.008817 | 0.008690 | 0.007364 | 0.005924 | 0.008802 | NaN | 0.0 | 0.002520 | 0.003324 | NaN | 0.009455 | 0.008348 | NaN | 1 | 0.007053 | 0.003909 | 0.002478 | 0.006614 | 0.009977 | 0.007686 | 1.002775 | 0.007791 | 0.000458 | 0.007320 | 1.503701 | 1.007036 | 0.000664 | 0.003200 | 0.008507 | 0.004858 | 0.000063 | 0.913205 | 0.140058 | 0.926341 | 0.975499 | 0.007571 | 0.808214 | 1.001517 | 1.004514 | 0.935383 | NaN | 0.672620 | 0.002729 | 0.011720 | NaN | 1.005338 | 2.0 | NaN | 0.009703 | NaN | NaN | NaN | 1.002538 | 0.153939 | 0.844667 | 0.000053 | 1.0 | 0.240910 | 0.0 | 4.0 | 0.239400 | 0.240727 | 0.0 | 0.711546 | 0.436903 | 0.005135 | 0.687779 | 0.004649 | 1.0 | 1.001671 | 1.003460 | 1.007573 | 0.007703 | NaN | 0.007108 | 0.008261 | NaN | 0.007206 | 0.002409 | NaN | NaN | NaN | NaN | NaN | 0.006117 | 0.004516 | 0.003200 | NaN | 0.008419 | 0.006527 | 0.009600 |
| 4 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... | 2017-07-16 | 0.947248 | 0.002483 | 0.015193 | 1.000727 | 0.007605 | 0.117325 | 0.004653 | 0.009312 | NaN | NaN | 0.007731 | 0.038862 | 0.720619 | 0.143630 | 0.006667 | 0.231009 | 0.529305 | NaN | NaN | 0.075900 | 0.048918 | 0.001199 | 0.154026 | 1.341735 | 0.000519 | 0.001362 | 0.205468 | 0.691986 | 0.121518 | NaN | 0.054221 | 0.009484 | 0.006698 | 1.003738 | 0.003846 | 0.094311 | 0.129539 | 0.917133 | 0.231110 | 0.154914 | 0.095178 | 0.001653 | 0.126443 | 0.039259 | 0.027897 | 0.063697 | 0.233103 | 0.175655 | 0.031171 | 0.487460 | 0.093915 | CR | O | 0.009479 | 0.001714 | NaN | 0.650112 | 0.007773 | NaN | 0.005851 | 6.0 | 0.216507 | 0.008605 | 0.507712 | 0.006821 | 0.000152 | 0.000104 | 0.001419 | 0.096441 | 0.007972 | 0.106020 | 0.032733 | NaN | 0.008156 | 0.076561 | 0.074432 | NaN | 0.004933 | 0.004831 | NaN | 0.001833 | 0.005738 | 0.003289 | 0.002608 | 0.007338 | NaN | 0.007447 | 0.004465 | 0.006111 | 0.002246 | 0.002109 | 0.001141 | 0.506213 | 0.001770 | 1.000904 | 0.062492 | 0.005860 | 0.001845 | 0.007816 | 0.002470 | 0.005516 | 0.007166 | NaN | 0.0 | 0.000155 | 0.001504 | NaN | 0.002019 | 0.002678 | NaN | 1 | 0.007728 | 0.003432 | 0.002199 | 0.005511 | 0.004105 | 0.009656 | 1.006536 | 0.005158 | 0.003341 | 0.000264 | 1.509905 | 1.002915 | 0.003079 | 0.003845 | 0.007190 | 0.002983 | 0.000535 | 0.921026 | 0.131620 | 0.933479 | 0.978027 | 0.018200 | 0.822281 | 1.006125 | 1.005735 | 0.953363 | NaN | 0.673869 | 0.009998 | 0.017598 | NaN | 1.003175 | 2.0 | NaN | 0.009120 | NaN | NaN | NaN | 1.000130 | 0.120717 | 0.811199 | 0.008724 | 1.0 | 0.247939 | 0.0 | 4.0 | 0.244199 | 0.242325 | 0.0 | 0.705343 | 0.437433 | 0.002849 | 0.688774 | 0.000097 | 1.0 | 1.009886 | 1.005053 | 1.008132 | 0.009823 | NaN | 0.009680 | 0.004848 | NaN | 0.006312 | 0.004462 | NaN | NaN | NaN | NaN | NaN | 0.003671 | 0.004946 | 0.008889 | NaN | 0.001670 | 0.008126 | 0.009827 |
Using the info() method, you can have a better sense of how the dataset is structured.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35000 entries, 0 to 34999
Data columns (total 190 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customer_ID 35000 non-null object
1 S_2 35000 non-null object
2 P_2 34690 non-null float64
3 D_39 35000 non-null float64
4 B_1 35000 non-null float64
5 B_2 34994 non-null float64
6 R_1 35000 non-null float64
7 S_3 28053 non-null float64
8 D_41 34994 non-null float64
9 B_3 34994 non-null float64
10 D_42 4889 non-null float64
11 D_43 24217 non-null float64
12 D_44 33279 non-null float64
13 B_4 35000 non-null float64
14 D_45 34994 non-null float64
15 B_5 35000 non-null float64
16 R_2 35000 non-null float64
17 D_46 27411 non-null float64
18 D_47 35000 non-null float64
19 D_48 30448 non-null float64
20 D_49 3808 non-null float64
21 B_6 34999 non-null float64
22 B_7 35000 non-null float64
23 B_8 34833 non-null float64
24 D_50 15101 non-null float64
25 D_51 35000 non-null float64
26 B_9 35000 non-null float64
27 R_3 35000 non-null float64
28 D_52 34832 non-null float64
29 P_3 33203 non-null float64
30 B_10 35000 non-null float64
31 D_53 9055 non-null float64
32 S_5 35000 non-null float64
33 B_11 35000 non-null float64
34 S_6 35000 non-null float64
35 D_54 34994 non-null float64
36 R_4 35000 non-null float64
37 S_7 28053 non-null float64
38 B_12 35000 non-null float64
39 S_8 35000 non-null float64
40 D_55 33844 non-null float64
41 D_56 16107 non-null float64
42 B_13 34687 non-null float64
43 R_5 35000 non-null float64
44 D_58 35000 non-null float64
45 S_9 16298 non-null float64
46 B_14 35000 non-null float64
47 D_59 34374 non-null float64
48 D_60 35000 non-null float64
49 D_61 31232 non-null float64
50 B_15 34974 non-null float64
51 S_11 35000 non-null float64
52 D_62 30170 non-null float64
53 D_63 35000 non-null object
54 D_64 33718 non-null object
55 D_65 35000 non-null float64
56 B_16 34994 non-null float64
57 B_17 15208 non-null float64
58 B_18 35000 non-null float64
59 B_19 34994 non-null float64
60 D_66 3888 non-null float64
61 B_20 34994 non-null float64
62 D_68 33738 non-null float64
63 S_12 35000 non-null float64
64 R_6 35000 non-null float64
65 S_13 35000 non-null float64
66 B_21 35000 non-null float64
67 D_69 33846 non-null float64
68 B_22 34994 non-null float64
69 D_70 34440 non-null float64
70 D_71 35000 non-null float64
71 D_72 34866 non-null float64
72 S_15 35000 non-null float64
73 B_23 35000 non-null float64
74 D_73 362 non-null float64
75 P_4 35000 non-null float64
76 D_74 34901 non-null float64
77 D_75 35000 non-null float64
78 D_76 4140 non-null float64
79 B_24 35000 non-null float64
80 R_7 35000 non-null float64
81 D_77 18900 non-null float64
82 B_25 34974 non-null float64
83 B_26 34994 non-null float64
84 D_78 33279 non-null float64
85 D_79 34582 non-null float64
86 R_8 35000 non-null float64
87 R_9 2087 non-null float64
88 S_16 35000 non-null float64
89 D_80 34901 non-null float64
90 R_10 35000 non-null float64
91 R_11 35000 non-null float64
92 B_27 34994 non-null float64
93 D_81 34844 non-null float64
94 D_82 9126 non-null float64
95 S_17 35000 non-null float64
96 R_12 35000 non-null float64
97 B_28 35000 non-null float64
98 R_13 35000 non-null float64
99 D_83 33846 non-null float64
100 R_14 35000 non-null float64
101 R_15 35000 non-null float64
102 D_84 34832 non-null float64
103 R_16 35000 non-null float64
104 B_29 2401 non-null float64
105 B_30 34994 non-null float64
106 S_18 35000 non-null float64
107 D_86 35000 non-null float64
108 D_87 35 non-null float64
109 R_17 35000 non-null float64
110 R_18 35000 non-null float64
111 D_88 45 non-null float64
112 B_31 35000 non-null int64
113 S_19 35000 non-null float64
114 R_19 35000 non-null float64
115 B_32 35000 non-null float64
116 S_20 35000 non-null float64
117 R_20 34999 non-null float64
118 R_21 35000 non-null float64
119 B_33 34994 non-null float64
120 D_89 34832 non-null float64
121 R_22 35000 non-null float64
122 R_23 35000 non-null float64
123 D_91 34066 non-null float64
124 D_92 35000 non-null float64
125 D_93 35000 non-null float64
126 D_94 35000 non-null float64
127 R_24 35000 non-null float64
128 R_25 35000 non-null float64
129 D_96 35000 non-null float64
130 S_22 34872 non-null float64
131 S_23 34998 non-null float64
132 S_24 34874 non-null float64
133 S_25 34917 non-null float64
134 S_26 34996 non-null float64
135 D_102 34751 non-null float64
136 D_103 34403 non-null float64
137 D_104 34403 non-null float64
138 D_105 15816 non-null float64
139 D_106 3771 non-null float64
140 D_107 34403 non-null float64
141 B_36 35000 non-null float64
142 B_37 35000 non-null float64
143 R_26 3717 non-null float64
144 R_27 34198 non-null float64
145 B_38 34994 non-null float64
146 D_108 202 non-null float64
147 D_109 34991 non-null float64
148 D_110 180 non-null float64
149 D_111 180 non-null float64
150 B_39 188 non-null float64
151 D_112 34990 non-null float64
152 B_40 35000 non-null float64
153 S_27 25742 non-null float64
154 D_113 33955 non-null float64
155 D_114 33955 non-null float64
156 D_115 33955 non-null float64
157 D_116 33955 non-null float64
158 D_117 33955 non-null float64
159 D_118 33955 non-null float64
160 D_119 33955 non-null float64
161 D_120 33955 non-null float64
162 D_121 33955 non-null float64
163 D_122 33955 non-null float64
164 D_123 33955 non-null float64
165 D_124 33955 non-null float64
166 D_125 33955 non-null float64
167 D_126 34310 non-null float64
168 D_127 35000 non-null float64
169 D_128 34403 non-null float64
170 D_129 34403 non-null float64
171 B_41 34996 non-null float64
172 B_42 498 non-null float64
173 D_130 34403 non-null float64
174 D_131 34403 non-null float64
175 D_132 3781 non-null float64
176 D_133 34739 non-null float64
177 R_28 35000 non-null float64
178 D_134 1317 non-null float64
179 D_135 1317 non-null float64
180 D_136 1317 non-null float64
181 D_137 1317 non-null float64
182 D_138 1317 non-null float64
183 D_139 34403 non-null float64
184 D_140 34751 non-null float64
185 D_141 34403 non-null float64
186 D_142 5945 non-null float64
187 D_143 34403 non-null float64
188 D_144 34751 non-null float64
189 D_145 34403 non-null float64
dtypes: float64(185), int64(1), object(4)
memory usage: 50.7+ MB
print(f'Number of data types object cols: {data.select_dtypes("object").columns.size}')
print(f'Number of data types int cols: {data.select_dtypes("int64").columns.size}')
print(f'Number of data types float cols: {data.select_dtypes("float64").columns.size}')
Number of data types object cols: 4
Number of data types int cols: 1
Number of data types float cols: 185
Data preprocessing
customer_ID hurts your eyes!
When you execute the head method on the dataframe, the customer_ID column hurts your eyes. It’s a column that stores strings that appear to be hexadecimal. This type of string typically represents binary data in a human-readable format, using the characters 0-9 and a-f (or A-F) to represent the values 0 through 15 in each digit. Given its length (64 hexadecimal characters), this string is likely a SHA-256 hash.
This means that each row uses 64 bytes of memory. So, following Chris Deotte’s advice here and here, we are going to take the last 16 digits of the hexadecimal string and convert them to an integer.
data['customer_ID'] = data['customer_ID'].apply(lambda x: int(x[-16:], 16)).astype('int32')From 11 categorical columns to 13 categorical columns
American Express has also provide a list of categorical features that we can convert to categorical type in Pandas. This will reduce the memory usage of these columns. The list is below:
['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
data[cat_cols].head(5)| B_30 | B_38 | D_114 | D_116 | D_117 | D_120 | D_126 | D_63 | D_64 | D_66 | D_68 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 2.0 | 1.0 | 0.0 | 4.0 | 0.0 | 1.0 | CR | O | NaN | 6.0 |
| 1 | 0.0 | 2.0 | 1.0 | 0.0 | 4.0 | 0.0 | 1.0 | CR | O | NaN | 6.0 |
| 2 | 0.0 | 2.0 | 1.0 | 0.0 | 4.0 | 0.0 | 1.0 | CR | O | NaN | 6.0 |
| 3 | 0.0 | 2.0 | 1.0 | 0.0 | 4.0 | 0.0 | 1.0 | CR | O | NaN | 6.0 |
| 4 | 0.0 | 2.0 | 1.0 | 0.0 | 4.0 | 0.0 | 1.0 | CR | O | NaN | 6.0 |
data[cat_cols].nunique()
B_30 3
B_38 7
D_114 2
D_116 2
D_117 7
D_120 2
D_126 3
D_63 6
D_64 4
D_66 2
D_68 7
dtype: int64
To convert a columns to categorical type, you can use the following code:
data[cat_cols] = data[cat_cols].astype('category')for cat in cat_cols:
data[cat] = data[cat].cat.codesAmong the numerical columns, we also discover that there are some columns that are categorical. We can convert them to categorical type in Pandas. This will reduce the memory usage of these columns
for col in data.select_dtypes(include='number'):
if data[col].nunique() < 3:
print('Columns with less than 2 unique values', data[col].name, data[col].dtype)Columns with less than 2 unique values D_87 float64
Columns with less than 2 unique values B_31 int64
data[['D_87', 'B_31']].value_counts()
D_87 B_31
1.0 1 29
0 6
Name: count, dtype: int64
for col in ['D_87', 'B_31']:
data[col] = data[col].fillna(-1)
data[col] = data[col].astype('int8')Convert S_2 to a datetime data type
The S_2 column is a date column that is stored as a string. We can convert it to a datetime data type in Pandas using the following code:
data['S_2'] = pd.to_datetime(data['S_2'], format='%Y-%m-%d')This will reduce the memory usage of this column but we can go further and split the date into year, month, and day columns. This will allow us to analyze the data by year, month, and day, and will also reduce the memory usage even further.
# Split into separate columns
data['S_2_year'] = (data['S_2'].dt.year % 100).astype('int8')
data['S_2_month'] = data['S_2'].dt.month.astype('int8')
data['S_2_day'] = data['S_2'].dt.day.astype('int8')
del data['S_2']185 columns float64 to float32 or float16
The first thing you notice when you inspect the info() method is that we have 185 float64 columns which typically consume 64 bits, or 8 bytes per element. This type of data type is used to represent double precision floating point numbers, which are numbers that have very large or very small magnitude and/or require a significant number of digits of precision. As we are going to learn latter, we can reduce the memory usage by changing the data type of these columns to float32 and even float16.
len(data.select_dtypes(include=np.float64).columns)
175
After all the transformation so far, we reduced the number to 175 columns. If we change the data type of these columns to float32, we will reduce the memory usage by half. If we change the data type to float16, we will reduce the memory usage by 75%.
So converting from float64 to float32 you can save 50% of memory usage without a significant loss in precision for many applications.
As Chris Deotte pointed out in the notebook previously mentioned, They discovered that we can convert these to float16 since the data has added uniform noise. This means that the data is not as precise as it seems. This can be also done for privacy protection.
for col in data.select_dtypes(include=np.float64).columns:
data[col] = data[col].fillna(-1).astype('float16')Chunk strategy
Once we’ve thoroughly understood our dataset and identified all necessary transformations to make it more manageable, the next step is to apply these transformations to the entire dataset. To avoid memory issues, we will process the data in chunks, similar to how batches are used in deep learning. Therefore, it’s essential to develop a strategy for determining the sizes of these chunks. We will calculate the memory consumption per row based on our original sample data, allowing us to optimize the chunk size effectively.
Memory consumption for 35,000 rows: 59.81 MB
data = pd.read_csv(PATH, nrows=35000)
data.memory_usage(deep=True).sum() / 1024**2
59.81640434265137
Memory usage per row is: \(\frac{59.81}{35000}=0.0017\) MB
(data.memory_usage(deep=True).sum() / 1024**2)/data.shape[0]
0.0017090401240757534
In order to have enough memory we are going to consider to use half of the memory available.
Target memory usage aprox: \(\frac{8.26}{2}=4.15\) GB
available_memory_gb() / 2
4.147722244262695
Chunk size in rows: \(\frac{4.15}{0.0017}=2431508\) rows
(available_memory_gb() / 2) *1000 / ((data.memory_usage(deep=True).sum() / 1024**2)/data.shape[0])
2431507.734935815
A more conservative approach would be to scale down this estimate by a factor to ensure smooth processing.
2431507.734935815 * 0.1
243150.7734935815
Our final chunk size will be 243151 rows.
del data
gc.collect()
1194
Introduction to PyArrow and Parquet
PyArrow is a Python library that provides a bridge between Python and the Apache Arrow in-memory data format.
Apache Arrow is designed to improve the performance and efficiency of data by providing a standardized in-memory columnar data format that can be shared between different systems and languages. PyArrow includes support for reading and writing Parquet files.
import pyarrow.parquet as pq -> Imports the PyArrow library’s Parquet module, which provides tools for reading and writing Parquet files
import pyarrow as pa -> Imports the PyArrow library to access data structures like Tables, which are used to interface with Parquet files and other Arrow functionalities.
Once the libraries have been imported, we need to define the path where the Parquet file is located. Also we need to initialize the writervariable to None that will be used to create a ParquetWriter object, which will be used to write the data to the Parquet file.
output_path = 'train.parquet'
writer = NoneBased on our estimation of memory usage, we set the chunksize to 243151. This is going to control how much data is read into memory at once. We can then iterate over the chunks of the DataFrame and write them to the Parquet file.
Next we can start to apply all the transformations to the data.
chunksize = 243151
for chunk in pd.read_csv(PATH, chunksize=chunksize, engine='c'):
# Transforming customer_ID
chunk['customer_ID'] = chunk['customer_ID'].apply(lambda x: int(x[-16:], 16)).astype('int32')
# Apply datetime transformations
chunk['S_2'] = pd.to_datetime(chunk['S_2'], format='%Y-%m-%d')
chunk['year_last_2_digits'] = chunk['S_2'].dt.year % 100
chunk['month'] = chunk['S_2'].dt.month
chunk['day'] = chunk['S_2'].dt.day
chunk[['year_last_2_digits', 'month', 'day']] = chunk[['year_last_2_digits', 'month', 'day']].astype('int8')
# Apply category transformations
chunk[cat_cols] = chunk[cat_cols].astype('category')
for cat in cat_cols:
chunk[cat] = chunk[cat].cat.codes.astype('int8')
# Additional columns to convert to 'int8'
for col in ['D_87', 'B_31']:
chunk[col] = chunk[col].fillna(-1).astype('int8')
# Convert float64 columns to float16 for all floating type columns
for col in chunk.select_dtypes(include=np.float64).columns:
chunk[col] = chunk[col].fillna(-1).astype('float16')Inside of the loop, we need to convert the the chunk dataframe into a PyArrow Table object using the pa.Table.from_pandas() method. This method takes the DataFrame as input and returns a Table object that can be written to a Parquet file.
# Convert DataFrame to PyArrow Table
table = pa.Table.from_pandas(chunk)Now we need to initialize the ParquetWriter object if it is not already initialized. We can do this by checking if the writer variable is None. If it is, we create a new ParquetWriter object using the pq.ParquetWriter() method. This method takes the output path and the schema of the data as input.
if writer is None:
writer = pq.ParquetWriter(output_path, table.schema, compression='snappy')Finally the next line of code, writes the PyArrow Table object to the Parquet file using the writer.write_table() method. This is done incrementally for each chunk of data read from the CSV file.
writer.write_table(table)After the loop is finished, we need to close the ParquetWriter object using the writer.close() method. This will finalize the writing process and save the data to the Parquet file.
if writer:
writer.close()This is a simple example of how to convert a large CSV file to a Parquet file using PyArrow. This process can be customized further based on the specific requirements of the data and the desired output format.
Running the whole code
Let’s run the whole code to get our final train set in Parquet format.
# Define the path for the output Parquet file
output_path = 'train.parquet'
# Initialize PyArrow Parquet writer lazily; it needs the first chunk's schema
writer = None
# Read the CSV file in chunks
chunksize = 243151  # Customized to your available memory and dataset size
# Columns American Express documents as categorical
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
try:
    # NOTE(review): na_values=-1 was not used in the exploratory pass above; it
    # turns any literal -1 in the CSV into NaN before the -1 refill below —
    # confirm this is intended.
    for chunk in pd.read_csv(PATH, chunksize=chunksize, na_values=-1, engine='c'):
        # customer_ID: last 16 hex digits -> 64-bit integer.
        # uint64 (not int32) is required: 16 hex digits span the full 64-bit range,
        # so int32 would overflow and collide.
        chunk['customer_ID'] = chunk['customer_ID'].apply(lambda x: int(x[-16:], 16)).astype('uint64')
        # Datetime transformations: split S_2 into compact int8 parts
        chunk['S_2'] = pd.to_datetime(chunk['S_2'], format='%Y-%m-%d')
        chunk['year_last_2_digits'] = chunk['S_2'].dt.year % 100
        chunk['month'] = chunk['S_2'].dt.month
        chunk['day'] = chunk['S_2'].dt.day
        chunk[['year_last_2_digits', 'month', 'day']] = chunk[['year_last_2_digits', 'month', 'day']].astype('int8')
        # Categorical columns -> integer codes (pandas maps NaN to -1).
        # NOTE(review): codes are per-chunk; if a chunk is missing a category,
        # the same value can be encoded differently in another chunk — consider
        # fixing the category list globally before encoding.
        chunk[cat_cols] = chunk[cat_cols].astype('category')
        for cat in cat_cols:
            chunk[cat] = chunk[cat].cat.codes.astype('int8')
        # Flag-like columns to 'int8' with -1 as the missing sentinel
        for col in ['D_87', 'B_31']:
            chunk[col] = chunk[col].fillna(-1).astype('int8')
        # Remaining float64 columns -> float16 (the data has added uniform noise,
        # so the precision loss is acceptable)
        for col in chunk.select_dtypes(include=np.float64).columns:
            chunk[col] = chunk[col].fillna(-1).astype('float16')
        # Convert the chunk to a PyArrow Table and stream it to the Parquet file
        table = pa.Table.from_pandas(chunk)
        if writer is None:
            # Initialize writer with schema from the first chunk
            writer = pq.ParquetWriter(output_path, table.schema, compression='snappy')
        writer.write_table(table)
finally:
    # Close the writer even if a chunk fails, so the Parquet footer is written
    # and the file is finalized.
    if writer:
        writer.close()