From 1c20ff132915f48f3c0faad86204f0487cba690f Mon Sep 17 00:00:00 2001 From: scoobybejesus Date: Sat, 14 Nov 2020 17:33:29 -0500 Subject: [PATCH] Add helper script for sanitizing CSV input file. --- clean_input_csv.py | 161 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100755 clean_input_csv.py diff --git a/clean_input_csv.py b/clean_input_csv.py new file mode 100755 index 0000000..db7e8a4 --- /dev/null +++ b/clean_input_csv.py @@ -0,0 +1,161 @@ +#!/usr/bin/python3 + +## Purpose: Allows user to keep additional data in their CSV Input File to increase its usefulness and +## enhance readability, yet be able to properly format it prior to importing into `cryptools`. +## e.g.: +## -Keep an additional first column for flagging/noting important transactions +## -Keep additional columns for tracking a running balance +## -Rows beneath transactions for life-to-date totals and other calculations and notes +## -Ability to use number formatting with parenthesis for negative numbers and commas +## -This script will change (1,000.00) to 1000.00 +## +## If a column doesn't have a header, this script will exclude it from the sanitized output. +## Similarly, this script will exclude transaction rows missing data in either of the first +## two fields of the row. + +## Usage: + +# 1. Export/Save crypto activity as csv +# 2. Move the csv file to your desired directory +# 3. Rename file to .csv (see variable below) +# 4. Build/run this file in an editor or on command line (from same directory), creating the input file +# 5. Import the input file into cryptools + +import csv +import re +import os + +unedited = "DigiTrnx.csv" # To be replaced with a launch arg, presumably + +stage1 = "stage1.csv" + +## First, writes all header rows. Then attempts to write all transaction rows. +## In the transaction rows, if it finds blank/empty transaction date or proceeds fields, +## it discards the row. + +## This allows notes/sums/calculations/etc under the transaction rows to be discarded + +with open(unedited) as fin, open(stage1, 'a') as fout: + rdr = csv.reader(fin) + wtr = csv.writer(fout) + header = next(rdr) + header2 = next(rdr) + header3 = next(rdr) + header4 = next(rdr) + wtr.writerow(header) + wtr.writerow(header2) + wtr.writerow(header3) + wtr.writerow(header4) + + for row in rdr: + if row[0] == "" or row[1] == "": + pass + else: + wtr.writerow(row) + +stage2 = "stage2.csv" + +## Iterates over the fields in the first header row to search for empty/blank cells. +## Keeps a list of every column index that does contain data, and disregards all the +## indices for columns with a blank. + +## Using the indicies of valid columns, writes a new CSV file using only valid columns. + +## This is useful when the input file is also used to manually keep a running tally or +## columns with additional notes, but which must be discarded to prepare a proper +## CSV input file. + +with open(stage1) as fin, open(stage2, 'a') as fout: + rdr = csv.reader(fin) + wtr = csv.writer(fout) + header = next(rdr) + header2 = next(rdr) + header3 = next(rdr) + header4 = next(rdr) + + colListKept = [] + + for col in header: + if col == "": + pass + else: + colListKept.append(header.index(col)) + + output = [v for (i,v) in enumerate(header) if i in colListKept] + wtr.writerow(output) + + output = [v for (i,v) in enumerate(header2) if i in colListKept] + wtr.writerow(output) + + output = [v for (i,v) in enumerate(header3) if i in colListKept] + wtr.writerow(output) + + output = [v for (i,v) in enumerate(header4) if i in colListKept] + wtr.writerow(output) + + for row in rdr: + output = [v for (i,v) in enumerate(row) if i in colListKept] + wtr.writerow(output) + + +stage3 = "InputFile-pycleaned.csv" + +## Performs final formatting changes to ensure values can be successfully parsed. +## Numbers must have commas removed. Negative numbers must have parentheses replaced +## with a minus sign. Could also be used to substitute the date separation character. + +## i.e., (1.01) -> -1.01 (1,000.00) -> -1000.00 + +with open(stage2) as fin, open(stage3, 'w', newline='') as fout: + + rdr = csv.reader(fin, quoting=csv.QUOTE_ALL) + wtr = csv.writer(fout) + + header = next(rdr) + header2 = next(rdr) + header3 = next(rdr) + header4 = next(rdr) + wtr.writerow(header) + wtr.writerow(header2) + wtr.writerow(header3) + wtr.writerow(header4) + + for row in rdr: + listRow = [] + for field in row: + fieldStr = str(field) # cast as string, just so there's no funny business + try: + # Handles negative numbers + if fieldStr[0] == "(": + fieldStr = fieldStr.replace('(','-').replace(')', '').replace(',', '') + listRow.append(fieldStr) + continue + + # Uncomment the below and modify as necessary if you want to change date formatting + # elif re.search(r'\d\d-\d\d-\d\d',fieldStr):# Find dates and change formatting + # fieldStr = fieldStr.replace('-', '/') + # listRow.append(fieldStr) + # continue + + # Handle commas in remaining fields + else: + try: + # if you remove commas from a string and are able to convert to float... + fieldStr_test = fieldStr.replace(',', '') + fieldStr_float = float(fieldStr_test) + # then it is definitely a positive number, so remove the comma. + fieldStr = fieldStr.replace(',', '') + listRow.append(fieldStr) + continue + except: # If the 'try' block fails, it's a memo, not a number, so leave any commas + listRow.append(fieldStr) + continue + except: # If the `try` block fails, it's a blank/empty string + listRow.append(fieldStr) + continue + wtr.writerow(listRow) + +os.remove(stage1) +os.remove(stage2) + +print("Input file ready") \ No newline at end of file