Default repository for chkcsv.py.
Revisão | 0bcecc677719af348eea7a9b4da4d9b9e845859e (tree) |
---|---|
Hora | 2018-10-28 05:44:49 |
Autor | Dreas Nielsen <dreas.nielsen@gmai...> |
Committer | Dreas Nielsen |
Modified to run under Python 3 as well as 2, and factored the code for easier use as a library module.
@@ -2,6 +2,8 @@ | ||
2 | 2 | MANIFEST |
3 | 3 | chkcsv.htm |
4 | 4 | .pypirc |
5 | +.*~ | |
6 | +.*\.pyc | |
5 | 7 | dist/* |
6 | 8 | doc/build/* |
7 | 9 | test/* |
@@ -0,0 +1,1 @@ | ||
1 | +from .chkcsv import * |
@@ -20,7 +20,7 @@ | ||
20 | 20 | # though a section name for them is reserved. |
21 | 21 | # |
22 | 22 | # COPYRIGHT: |
23 | -# Copyright (c) 2011, R.Dreas Nielsen (RDN) | |
23 | +# Copyright (c) 2011,2018 R.Dreas Nielsen (RDN) | |
24 | 24 | # |
25 | 25 | # LICENSE: |
26 | 26 | # GPL v.3 |
@@ -37,14 +37,20 @@ | ||
37 | 37 | # Date Remarks |
38 | 38 | # ---------- -------------------------------------------------------------- |
39 | 39 | # 2011-09-25 First version. Version 0.8.0.0. RDN. |
40 | +# 2018-10-27 Converted to run under both Python 2 and 3. Version 1.0.0. RDN. | |
40 | 41 | # ============================================================================ |
41 | 42 | |
42 | -_version = "0.8.0.0" | |
43 | -_vdate = "2011-09-24" | |
43 | +_version = "1.0.0" | |
44 | +_vdate = "2018-10-27" | |
44 | 45 | |
45 | 46 | import sys |
46 | 47 | from optparse import OptionParser |
47 | -import ConfigParser | |
48 | +try: | |
49 | + # Py2 | |
50 | + from ConfigParser import SafeConfigParser as ConfigParser | |
51 | +except: | |
52 | + # Py3 | |
53 | + from configparser import ConfigParser | |
48 | 54 | import codecs |
49 | 55 | import os.path |
50 | 56 | import csv |
@@ -71,15 +77,23 @@ | ||
71 | 77 | self.column = column |
72 | 78 | |
73 | 79 | class CsvChecker(): |
74 | - """Object to check a specific column of a defined type. After initialization, the 'check()' | |
75 | - method will return a boolean indicating whether a data value is acceptable.""" | |
80 | + """Create an object to check a specific column of a defined type. | |
81 | + | |
82 | + :param fmt_spec: A ConfigParser object. | |
83 | + :param colname: The name of the data column. | |
84 | + :param column_required_default: A Boolean indicating whether the column is required by default. | |
85 | + :param data_required_default: A Boolean indicating whether data values are required (non-null) by default. | |
86 | + | |
87 | + After initialization, the 'check()' | |
88 | + method will return a boolean indicating whether a data value is acceptable. | |
89 | + """ | |
76 | 90 | get_fn = { |
77 | - 'column_required' : ConfigParser.SafeConfigParser.getboolean, | |
78 | - 'data_required' : ConfigParser.SafeConfigParser.getboolean, | |
79 | - 'type' : ConfigParser.SafeConfigParser.get, | |
80 | - 'minlen' : ConfigParser.SafeConfigParser.getint, | |
81 | - 'maxlen' : ConfigParser.SafeConfigParser.getint, | |
82 | - 'pattern' : ConfigParser.SafeConfigParser.get | |
91 | + 'column_required' : ConfigParser.getboolean, | |
92 | + 'data_required' : ConfigParser.getboolean, | |
93 | + 'type' : ConfigParser.get, | |
94 | + 'minlen' : ConfigParser.getint, | |
95 | + 'maxlen' : ConfigParser.getint, | |
96 | + 'pattern' : ConfigParser.get | |
83 | 97 | } |
84 | 98 | datetime_fmts = ("%x", |
85 | 99 | "%c", |
@@ -214,7 +228,7 @@ | ||
214 | 228 | return None |
215 | 229 | if type(data) == type(datetime.date.today()): |
216 | 230 | return None |
217 | - if type(data) != types.StringType: | |
231 | + if type(data) != type(""): | |
218 | 232 | if data==None: |
219 | 233 | return "missing date/time" |
220 | 234 | try: |
@@ -235,7 +249,7 @@ | ||
235 | 249 | return None |
236 | 250 | if type(data) == type(datetime.date.today()): |
237 | 251 | return None |
238 | - if type(data) != types.StringType: | |
252 | + if type(data) != type(""): | |
239 | 253 | if data==None: |
240 | 254 | return "missing date" |
241 | 255 | try: |
@@ -354,49 +368,76 @@ | ||
354 | 368 | self.reader = codecs.getreader(encoding)(f) |
355 | 369 | def __iter__(self): |
356 | 370 | return self |
357 | - def next(self): | |
358 | - return self.reader.next().encode('utf-8') | |
371 | + def __next__(self): | |
372 | + return next(self.reader).encode('utf-8') | |
359 | 373 | |
360 | 374 | class UnicodeReader: |
361 | 375 | """A CSV reader which will iterate over lines in the CSV file "f", |
362 | - which is encoded in the given encoding.""" | |
376 | + which is encoded in the given encoding. | |
377 | + """ | |
363 | 378 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): |
364 | 379 | f = UTF8Recoder(f, encoding) |
365 | 380 | self.reader = csv.reader(f, dialect=dialect, **kwds) |
366 | - def next(self): | |
367 | - row = self.reader.next() | |
381 | + def __next__(self): | |
382 | + row = next(self.reader) | |
368 | 383 | return [unicode(s, "utf-8") for s in row] |
369 | 384 | def __iter__(self): |
370 | 385 | return self |
371 | 386 | |
372 | 387 | def show_errors(errlist): |
373 | - """Items in errlist are a tuple of a narrative message, the name of the file | |
374 | - in which the error occurred, the line number of the file, and the column | |
375 | - name of the file. All but the first may be null.""" | |
388 | + """Write a list of error messages to stderr. | |
389 | + | |
390 | + :param errlist: A list of tuples, each holding a narrative message, the name of the file | |
391 | + in which the error occurred, the line number of the file, and the column | |
392 | + name of the file. All but the first may be null. | |
393 | + """ | |
376 | 394 | for err in errlist: |
377 | 395 | sys.stderr.write("%s.\n" % " ".join([ "%s %s" % em for em in [ e for e in |
378 | 396 | zip(("Error:", "in file", "on line", "in column"), err) if e[1]]])) |
379 | 397 | |
380 | 398 | |
399 | +def read_format_specs(fmt_file, column_required, data_required, chkopts="chkcsvoptions"): | |
400 | + """Read format specifications from a file. | |
401 | + | |
402 | + :param fmt_file: The name of the file containing format specifications. | |
403 | + :param column_required: Whether or not the column must be in the CSV file to be checked. | |
404 | + :param data_required: Whether or not a data value is required on every row of the CSV file. | |
405 | + :param chkopts: The name of a section in the format specification file containing additional options. | |
406 | + """ | |
407 | + fmtspecs = ConfigParser() | |
408 | + try: | |
409 | + files_read = fmtspecs.read([fmt_file]) | |
410 | + except configparser.Error: | |
411 | + raise ChkCsvError("Error reading format specification file.", fmt_file) | |
412 | + if len(files_read) == 0: | |
413 | + raise ChkCsvError("Error reading format specification file.", fmt_file) | |
414 | + # Convert ConfigParser object into a dictionary of CsvChecker objects keyed by column name | |
415 | + speccols = [ sect for sect in fmtspecs.sections() if sect != chkopts ] | |
416 | + cols = {} | |
417 | + for col in speccols: | |
418 | + cols[col] = CsvChecker(fmtspecs, col, column_required, data_required) | |
419 | + return cols | |
420 | + | |
421 | + | |
381 | 422 | def check_csv_file(csv_fname, cols, halt_on_err, columnexit, \ |
382 | 423 | linelength, caseinsensitive, encoding=None): |
383 | 424 | """Check that all of the required columns and data are present in the CSV file, and that |
384 | - the data conform to the appropriate type and other specification. | |
385 | - Arguments are: 1. The name of the CSV file to check; 2. A dictionary of | |
386 | - specifications (ChkCsv objects) indexed by column name; 3. Whether to exit | |
387 | - on the first error; 4. Whether to exit if the CSV file doesn't have | |
388 | - exactly the same columns in the format specifications; 5. Whether to | |
389 | - report an error if any data row has a different number of items than indicated | |
390 | - by the column headers; 6. Whether column names in the specifications and | |
391 | - CSV file should be compared case-insensitive; 7. The character encoding of | |
392 | - the CSV file. | |
425 | + the data conform to the appropriate type and other specifications. | |
426 | + | |
427 | + :param csv_fname: The name of the CSV file to check. | |
428 | + :param cols: A dictionary of specifications (CsvChecker objects) indexed by column name. | |
429 | + :param halt_on_err: Whether to exit on the first error. | |
430 | + :param columnexit: Whether to exit if the CSV file doesn't have exactly the same columns as the format specifications. | |
431 | + :param linelength: Whether to report an error if any data row has a different number of items than indicated by the column headers. | |
432 | + :param caseinsensitive: Whether column names in the specifications and CSV file should be compared case-insensitively. | |
433 | + :param encoding: The character encoding of the CSV file. | |
393 | 434 | """ |
394 | 435 | dialect = csv.Sniffer().sniff(open(csv_fname, "rt").readline()) |
395 | 436 | if encoding: |
396 | 437 | inf = UnicodeReader(open(csv_fname, "rt"), dialect, encoding) |
397 | 438 | else: |
398 | 439 | inf = csv.reader(open(csv_fname, "rt"), dialect) |
399 | - colnames = inf.next() | |
440 | + colnames = next(inf) | |
400 | 441 | req_cols = [ c for c in cols if cols[c].column_required ] |
401 | 442 | # Exit if all required columns are not present |
402 | 443 | if caseinsensitive: |
@@ -462,7 +503,7 @@ | ||
462 | 503 | if len(args)==0: |
463 | 504 | parser.print_help() |
464 | 505 | return 0 |
465 | - if len(args) <> 1: | |
506 | + if len(args) != 1: | |
466 | 507 | raise ChkCsvError("A single argument, the name of the CSV file to check, must be provided.") |
467 | 508 | csv_file = args[0] |
468 | 509 | if not os.path.exists(csv_file): |
@@ -474,22 +515,12 @@ | ||
474 | 515 | fmt_file = "%s.fmt" % fn |
475 | 516 | if not os.path.exists(fmt_file): |
476 | 517 | raise ChkCsvError("The format file does not exist.", fmt_file) |
477 | - fmtspecs = ConfigParser.SafeConfigParser() | |
478 | - try: | |
479 | - files_read = fmtspecs.read([fmt_file]) | |
480 | - except ConfigParser.Error: | |
481 | - raise ChkCsvError("Error reading format specification file.", fmt_file) | |
482 | - if len(files_read) == 0: | |
483 | - raise ChkCsvError("Error reading format specification file.", fmt_file) | |
518 | + # Get format specifications as a dictionary of CsvChecker objects (keyed by column name) from the format file. | |
484 | 519 | if opts.optsection: |
485 | 520 | chkopts = opts.optsection |
486 | 521 | else: |
487 | 522 | chkopts = "chkcsvoptions" |
488 | - # Convert ConfigParser object into a list of CsvChecker objects | |
489 | - speccols = [ sect for sect in fmtspecs.sections() if sect <> chkopts ] | |
490 | - cols = {} | |
491 | - for col in speccols: | |
492 | - cols[col] = CsvChecker(fmtspecs, col, opts.column_required, opts.data_required) | |
523 | + cols = read_format_specs(fmt_file, opts.column_required, opts.data_required, chkopts) | |
493 | 524 | # Check the file |
494 | 525 | errorlist = check_csv_file(csv_file, cols, opts.haltonerror, |
495 | 526 | opts.columnexit, opts.linelength, opts.caseinsensitive, opts.encoding) |
@@ -503,10 +534,10 @@ | ||
503 | 534 | if __name__=='__main__': |
504 | 535 | try: |
505 | 536 | status = main() |
506 | - except ChkCsvError, msg: | |
537 | + except ChkCsvError as msg: | |
507 | 538 | show_errors( [ (msg.errmsg, msg.infile, msg.line, msg.column) ] ) |
508 | 539 | exit(1) |
509 | - except SystemExit, x: | |
540 | + except SystemExit as x: | |
510 | 541 | sys.exit(x) |
511 | 542 | except Exception: |
512 | 543 | strace = traceback.extract_tb(sys.exc_info()[2])[-1:] |
@@ -16,9 +16,10 @@ | ||
16 | 16 | # add these directories to sys.path here. If the directory is relative to the |
17 | 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. |
18 | 18 | # |
19 | -# import os | |
20 | -# import sys | |
19 | +import os | |
20 | +import sys | |
21 | 21 | # sys.path.insert(0, os.path.abspath('.')) |
22 | +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__name__), '../../chkcsv'))) | |
22 | 23 | |
23 | 24 | |
24 | 25 | # -- General configuration ------------------------------------------------ |
@@ -55,9 +56,9 @@ | ||
55 | 56 | # built documents. |
56 | 57 | # |
57 | 58 | # The short X.Y version. |
58 | -version = u'0.8' | |
59 | +version = u'1.0' | |
59 | 60 | # The full version, including alpha/beta/rc tags. |
60 | -release = u'0.8.0' | |
61 | +release = u'1.0.0' | |
61 | 62 | |
62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation |
63 | 64 | # for a list of supported languages. |
@@ -81,6 +81,24 @@ | ||
81 | 81 | -x, --exitonerror Exit when the first error is found. |
82 | 82 | |
83 | 83 | |
84 | +.. _availability: | |
85 | + | |
86 | +Installation | |
87 | +================================ | |
88 | + | |
89 | + | |
90 | +The chkcsv program is available on `PyPI <https://pypi.org/project/chkcsv/>`_. | |
91 | +It can be installed with: | |
92 | + | |
93 | +.. code-block:: none | |
94 | + | |
95 | + pip install chkcsv | |
96 | + | |
97 | +By default, chkcsv.py will be installed as an executable script. | |
98 | +To use it as a library module, it must be copied to | |
99 | +Python's *site-packages* directory. | |
100 | + | |
101 | + | |
84 | 102 | Format Specifications |
85 | 103 | ============================ |
86 | 104 |
@@ -284,24 +302,37 @@ | ||
284 | 302 | pattern=(?i)(FT|M|CM)$ |
285 | 303 | |
286 | 304 | |
287 | -.. _availability: | |
288 | - | |
289 | -Availability | |
305 | +Module Documentation | |
290 | 306 | ================================ |
291 | 307 | |
308 | +To use chkcsv.py as a library module, it must be placed in either the | |
309 | +global or personal *site-packages* directory. The module provides the | |
310 | +following classes and functions to support reading of format specifications | |
311 | +from a file and checking a CSV file. | |
292 | 312 | |
293 | -The chkcsv program is available on `PyPi <https://pypi.org/project/chkcsv/>`_. | |
294 | -It can be installed with: | |
313 | +Classes | |
314 | +-------------------------------- | |
295 | 315 | |
296 | -.. code-block:: none | |
316 | +.. module:: chkcsv | |
317 | +.. autoclass:: CsvChecker | |
318 | + :members: | |
319 | + | |
297 | 320 | |
298 | - pip install chkcsv | |
321 | +Functions | |
322 | +------------------------------- | |
323 | + | |
324 | +.. autofunction:: read_format_specs | |
325 | + | |
326 | +.. autofunction:: check_csv_file | |
327 | + | |
328 | +.. autofunction:: show_errors | |
329 | + | |
299 | 330 | |
300 | 331 | |
301 | 332 | Copyright and License |
302 | 333 | ================================ |
303 | 334 | |
304 | -Copyright (c) 2011, R.Dreas Nielsen | |
335 | +Copyright (c) 2011, 2018, R.Dreas Nielsen | |
305 | 336 | |
306 | 337 | This program is free software: you can redistribute it and/or modify it |
307 | 338 | under the terms of the GNU General Public License as published by the |
@@ -1,21 +1,28 @@ | ||
1 | 1 | from distutils.core import setup |
2 | 2 | |
3 | 3 | setup(name='chkcsv', |
4 | - version='0.8.0.2', | |
5 | - description="Check the format of a CSV file", | |
4 | + version='1.0.0', | |
5 | + description="Checks the format of a CSV file with respect to a specifed set of column names and types.", | |
6 | 6 | author='Dreas Nielsen', |
7 | 7 | author_email='dreas.nielsen@gmail.com', |
8 | 8 | url='https://bitbucket.org/rdnielsen/chkcsv/', |
9 | 9 | scripts=['chkcsv/chkcsv.py'], |
10 | + requires=[], | |
11 | + python_requires = '>=2.7', | |
10 | 12 | classifiers=[ |
11 | 13 | 'Development Status :: 5 - Production/Stable', |
12 | 14 | 'Environment :: Console', |
13 | 15 | 'Intended Audience :: End Users/Desktop', |
14 | 16 | 'License :: OSI Approved :: GNU General Public License (GPL)', |
15 | 17 | 'Natural Language :: English', |
18 | + 'Programming Language :: Python :: 3', | |
19 | + 'Programming Language :: Python :: 2.7', | |
16 | 20 | 'Operating System :: OS Independent', |
17 | 21 | 'Topic :: Text Processing :: General', |
18 | - 'Topic :: Office/Business' | |
22 | + 'Topic :: Office/Business', | |
23 | + 'Topic :: Scientific/Engineering', | |
24 | + 'Topic :: Text Processing', | |
25 | + 'Topic :: Utilities' | |
19 | 26 | ], |
20 | 27 | long_description="""``chkcsv.py`` is a Python module and program |
21 | 28 | that checks the format of data in a CSV file. It can check whether required |