271 lines
7.5 KiB
Python
271 lines
7.5 KiB
Python
|
#!/usr/bin/env python
|
||
|
|
||
|
"""
|
||
|
This module contains the Python 2 replacement for :mod:`csv`.
|
||
|
"""
|
||
|
|
||
|
import codecs
|
||
|
import csv
|
||
|
|
||
|
import six
|
||
|
|
||
|
from agate.exceptions import FieldSizeLimitError
|
||
|
|
||
|
# Encoding names (lower-case, hyphenated) that UnicodeWriter treats as
# "eight-bit": for these it writes encoded fields straight to the output
# instead of going through an intermediate re-encoding queue.
EIGHT_BIT_ENCODINGS = [
    # UTF-8 and its aliases
    'utf-8',
    'u8',
    'utf',
    'utf8',
    # Latin-1 and its aliases
    'latin-1',
    'iso-8859-1',
    'iso8859-1',
    '8859',
    'cp819',
    'latin',
    'latin1',
    'l1',
]

# Candidate delimiters handed to csv.Sniffer when guessing a dialect.
POSSIBLE_DELIMITERS = [
    ',',
    '\t',
    ';',
    ' ',
    ':',
    '|',
]
|
||
|
|
||
|
|
||
|
class UTF8Recoder(six.Iterator):
    """
    Iterator adapter: decodes an input stream from ``encoding`` and hands
    back each item re-encoded as UTF-8.
    """
    def __init__(self, f, encoding):
        """
        :param f: The encoded stream to read from.
        :param encoding: Name of the codec used to decode ``f``.
        """
        make_stream_reader = codecs.getreader(encoding)
        self.reader = make_stream_reader(f)

    def __iter__(self):
        """
        The recoder is its own iterator.
        """
        return self

    def __next__(self):
        """
        Pull the next decoded item from the wrapped reader and return it
        encoded as UTF-8.
        """
        decoded = next(self.reader)
        return decoded.encode('utf-8')
|
||
|
|
||
|
|
||
|
class UnicodeReader(object):
    """
    A CSV reader which will read rows from a file in a given encoding.
    """
    def __init__(self, f, encoding='utf-8', field_size_limit=None, line_numbers=False, header=True, **kwargs):
        """
        :param f: The encoded stream to read CSV data from.
        :param encoding: Codec name used to decode ``f``.
        :param field_size_limit: If truthy, passed to
            :func:`csv.field_size_limit`. Note that this is set on the
            :mod:`csv` module, not on this reader alone.
        :param line_numbers: If true, a line-number column is inserted at
            the front of every row.
        :param header: Whether the first line is a header row; only consulted
            when ``line_numbers`` is true.
        :param kwargs: Forwarded unchanged to :func:`csv.reader`.
        """
        self.line_numbers = line_numbers
        self.header = header

        # csv.reader is fed UTF-8 encoded data; rows are decoded again in next().
        f = UTF8Recoder(f, encoding)

        self.reader = csv.reader(f, **kwargs)

        if field_size_limit:
            csv.field_size_limit(field_size_limit)

    def next(self):
        """
        Return the next row as a list of text strings.

        :raises FieldSizeLimitError: If a field exceeds the csv module's
            field size limit.
        :raises csv.Error: For any other parsing error (re-raised untouched).
        """
        try:
            row = next(self.reader)
        except csv.Error as e:
            # Terrible way to test for this exception, but there is no subclass
            if 'field larger than field limit' in str(e):
                raise FieldSizeLimitError(csv.field_size_limit())
            else:
                # Bare raise keeps the original traceback; ``raise e`` would
                # restart it from this frame on Python 2.
                raise

        if self.line_numbers:
            if self.header and self.line_num == 1:
                # The header row gets a column label rather than a number.
                row.insert(0, 'line_numbers')
            else:
                # With a header, data rows are numbered from 1.
                row.insert(0, str(self.line_num - 1 if self.header else self.line_num))

        return [six.text_type(s, 'utf-8') for s in row]

    def __iter__(self):
        """
        The reader is its own iterator.
        """
        return self

    @property
    def dialect(self):
        """
        The dialect of the wrapped :func:`csv.reader`.
        """
        return self.reader.dialect

    @property
    def line_num(self):
        """
        The ``line_num`` of the wrapped :func:`csv.reader`.
        """
        return self.reader.line_num
|
||
|
|
||
|
|
||
|
class UnicodeWriter(object):
    """
    A CSV writer which will write rows to a file in the specified encoding.

    NB: Optimized so that eight-bit encodings skip re-encoding. See:
        https://github.com/onyxfish/csvkit/issues/175
    """
    def __init__(self, f, encoding='utf-8', **kwargs):
        """
        :param f: The output stream.
        :param encoding: Target encoding of the written data.
        :param kwargs: Forwarded unchanged to :func:`csv.writer`.
        """
        self.encoding = encoding

        normalized_name = self.encoding.lower().replace('_', '-')
        self._eight_bit = normalized_name in EIGHT_BIT_ENCODINGS

        if not self._eight_bit:
            # Rows are first written to an in-memory queue and re-encoded
            # into the target encoding by writerow().
            self.queue = six.StringIO()
            self.writer = csv.writer(self.queue, **kwargs)
            self.stream = f
            self.encoder = codecs.getincrementalencoder(encoding)()
        else:
            # Eight-bit encodings are written straight to the output.
            self.writer = csv.writer(f, **kwargs)

    def writerow(self, row):
        """
        Write a single row; ``None`` values are written as empty strings.
        """
        if self._eight_bit:
            encoded_row = []
            for value in row:
                text = six.text_type('' if value is None else value)
                encoded_row.append(text.encode(self.encoding))

            self.writer.writerow(encoded_row)
            return

        utf8_row = []
        for value in row:
            text = six.text_type('' if value is None else value)
            utf8_row.append(text.encode('utf-8'))

        self.writer.writerow(utf8_row)

        # Take the UTF-8 output out of the queue, decode it, and re-encode
        # it into the target encoding before writing it to the real stream.
        buffered = self.queue.getvalue()
        decoded = buffered.decode('utf-8')
        self.stream.write(self.encoder.encode(decoded))

        # Reset the queue for the next row.
        self.queue.truncate(0)

    def writerows(self, rows):
        """
        Write every row of ``rows`` in order via :meth:`writerow`.
        """
        for each_row in rows:
            self.writerow(each_row)
|
||
|
|
||
|
|
||
|
class UnicodeDictReader(csv.DictReader):
    """
    Defer almost all implementation to :class:`csv.DictReader`, but wraps our
    unicode reader instead of :func:`csv.reader`.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None, *args, **kwargs):
        """
        :param f: The encoded stream to read CSV data from.
        :param fieldnames: Passed to :class:`csv.DictReader`.
        :param restkey: Passed to :class:`csv.DictReader`.
        :param restval: Passed to :class:`csv.DictReader`.
        :param args: Forwarded to both :class:`UnicodeReader` and
            :class:`csv.DictReader`.
        :param kwargs: Forwarded to :class:`UnicodeReader`; everything except
            the :class:`UnicodeReader`-only options is also forwarded to
            :class:`csv.DictReader`.
        """
        # NOTE(review): positional ``args`` land on different parameters in
        # the two constructors below -- prefer keyword arguments.
        reader = UnicodeReader(f, *args, **kwargs)

        # These options are understood by UnicodeReader only. csv.DictReader
        # hands its keywords to csv.reader, which rejects unknown keywords,
        # so every one of them (not just ``encoding``) must be stripped.
        for reader_only_option in ('encoding', 'field_size_limit', 'line_numbers', 'header'):
            kwargs.pop(reader_only_option, None)

        csv.DictReader.__init__(self, f, fieldnames, restkey, restval, *args, **kwargs)

        # Replace the plain csv.reader created by the base class.
        self.reader = reader
|
||
|
|
||
|
|
||
|
class UnicodeDictWriter(csv.DictWriter):
    """
    Defer almost all implementation to :class:`csv.DictWriter`, but wraps our
    unicode writer instead of :func:`csv.writer`.
    """
    def __init__(self, f, fieldnames, restval='', extrasaction='raise', *args, **kwds):
        """
        Set up the writer's attributes directly (the base class initializer
        is not called) and wrap ``f`` in a :class:`UnicodeWriter`.

        :param f: The output stream.
        :param fieldnames: Stored as ``self.fieldnames``.
        :param restval: Stored as ``self.restval``.
        :param extrasaction: Either ``"raise"`` or ``"ignore"``, compared
            case-insensitively.
        :param args: Forwarded to :class:`UnicodeWriter`.
        :param kwds: Forwarded to :class:`UnicodeWriter`.
        :raises ValueError: If ``extrasaction`` is not an accepted value.
        """
        self.fieldnames = fieldnames
        self.restval = restval

        accepted_actions = ('raise', 'ignore')
        action_is_valid = extrasaction.lower() in accepted_actions

        if not action_is_valid:
            raise ValueError('extrasaction (%s) must be "raise" or "ignore"' % extrasaction)

        self.extrasaction = extrasaction

        unicode_writer = UnicodeWriter(f, *args, **kwds)
        self.writer = unicode_writer
|
||
|
|
||
|
|
||
|
class Reader(UnicodeReader):
    """
    A unicode-aware CSV reader.

    Adds nothing of its own: all behavior is inherited from
    :class:`UnicodeReader`.
    """
|
||
|
|
||
|
|
||
|
class Writer(UnicodeWriter):
    """
    A unicode-aware CSV writer.
    """
    def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs):
        """
        :param f: The output stream.
        :param encoding: Target encoding of the written data.
        :param line_numbers: If true, a line-number column is prepended to
            every row written.
        :param kwargs: Forwarded to :class:`UnicodeWriter`;
            ``lineterminator`` defaults to ``'\\n'`` when not supplied.
        """
        self.line_numbers = line_numbers
        self.row_count = 0

        kwargs.setdefault('lineterminator', '\n')

        UnicodeWriter.__init__(self, f, encoding, **kwargs)

    def _append_line_number(self, row):
        """
        Insert the line-number cell at the front of ``row`` (in place) and
        advance the row counter. The first row written gets the column label
        instead of a number.
        """
        first_cell = 'line_number' if self.row_count == 0 else self.row_count
        row.insert(0, first_cell)

        self.row_count += 1

    def writerow(self, row):
        """
        Write a single row, prepending a line number when enabled.
        """
        if self.line_numbers:
            # Work on a copy so the caller's sequence is not modified.
            row = list(row)
            self._append_line_number(row)

        # Convert embedded Mac line endings to unix style line endings so they get quoted
        cleaned = []
        for cell in row:
            if isinstance(cell, six.string_types):
                cell = cell.replace('\r', '\n')
            cleaned.append(cell)

        UnicodeWriter.writerow(self, cleaned)

    def writerows(self, rows):
        """
        Write every row of ``rows`` in order via :meth:`writerow`.
        """
        for each_row in rows:
            self.writerow(each_row)
|
||
|
|
||
|
|
||
|
class DictReader(UnicodeDictReader):
    """
    A unicode-aware CSV DictReader.

    Adds nothing of its own: all behavior is inherited from
    :class:`UnicodeDictReader`.
    """
|
||
|
|
||
|
|
||
|
class DictWriter(UnicodeDictWriter):
    """
    A unicode-aware CSV DictWriter.
    """
    def __init__(self, f, fieldnames, encoding='utf-8', line_numbers=False, **kwargs):
        """
        :param f: The output stream.
        :param fieldnames: Column names, in output order.
        :param encoding: Target encoding of the written data.
        :param line_numbers: If true, a ``line_number`` column is written as
            the first column of every row.
        :param kwargs: Forwarded to :class:`UnicodeDictWriter`;
            ``lineterminator`` defaults to ``'\\n'`` when not supplied.
        """
        self.row_count = 0
        self.line_numbers = line_numbers

        if 'lineterminator' not in kwargs:
            kwargs['lineterminator'] = '\n'

        if self.line_numbers and 'line_number' not in fieldnames:
            # The extra column must be a known field, otherwise it is either
            # rejected or dropped depending on ``extrasaction``. Build a new
            # list so the caller's ``fieldnames`` is left untouched.
            fieldnames = ['line_number'] + list(fieldnames)

        UnicodeDictWriter.__init__(self, f, fieldnames, encoding=encoding, **kwargs)

    def _append_line_number(self, row):
        """
        Set ``row['line_number']`` (in place) and advance the row counter.

        The first row written is treated as the header row and receives the
        column label instead of a number, mirroring :class:`Writer`.
        """
        if self.row_count == 0:
            row['line_number'] = 'line_number'
        else:
            row['line_number'] = self.row_count

        self.row_count += 1

    def writerow(self, row):
        """
        Write a single row given as a mapping of field name to value.
        """
        if self.line_numbers:
            # Copy as a dict (not a list of keys) so the line number can be
            # added without mutating the caller's mapping.
            row = dict(row)
            self._append_line_number(row)

        # Convert embedded Mac line endings to unix style line endings so they get quoted
        row = dict(
            (k, v.replace('\r', '\n')) if isinstance(v, six.string_types) else (k, v)
            for k, v in row.items()
        )

        UnicodeDictWriter.writerow(self, row)

    def writerows(self, rows):
        """
        Write every row of ``rows`` in order via :meth:`writerow`.
        """
        for row in rows:
            self.writerow(row)
|
||
|
|
||
|
|
||
|
class Sniffer(object):
    """
    A functional wrapper of ``csv.Sniffer()``.
    """
    def sniff(self, sample):
        """
        A functional version of ``csv.Sniffer().sniff``, that extends the
        list of possible delimiters to include some seen in the wild.

        :param sample: Text sample to detect the dialect from.
        :returns: The detected dialect, or ``None`` if sniffing failed.
        """
        try:
            return csv.Sniffer().sniff(sample, POSSIBLE_DELIMITERS)
        except Exception:
            # Sniffing is best-effort: any ordinary failure means "unknown".
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return None
|
||
|
|
||
|
|
||
|
def reader(*args, **kwargs):
    """
    Stand-in for Python's :func:`csv.reader` that builds a
    :class:`.csv_py2.Reader`; all arguments are forwarded unchanged.
    """
    unicode_reader = Reader(*args, **kwargs)
    return unicode_reader
|
||
|
|
||
|
|
||
|
def writer(*args, **kwargs):
    """
    Stand-in for Python's :func:`csv.writer` that builds a
    :class:`.csv_py2.Writer`; all arguments are forwarded unchanged.
    """
    unicode_writer = Writer(*args, **kwargs)
    return unicode_writer
|