Skip to content

Commit 2211f2f

Browse files
authored
Merge pull request #178 from adobe-apiplatform/issue-167
Fix #167: allow non-ascii unicode chars in user and group names. Also fix #159 and fix #173, both for the second time :(.
2 parents 9088d63 + 3b09719 commit 2211f2f

File tree

10 files changed

+78
-35
lines changed

10 files changed

+78
-35
lines changed

examples/config files - basic/3 connector-ldap.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,14 @@ all_users_filter: "(&(objectClass=user)(objectCategory=person)(!(userAccountCont
6565
# or this one for OpenLDAP: "(&(|(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))"
6666
group_filter_format: "(&(|(objectCategory=group)(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))"
6767

68+
# (optional) string_encoding (default value given below)
69+
# string_encoding specifies the Unicode string encoding used by the directory.
70+
# All values retrieved from the directory are converted to Unicode before being
71+
# sent to or compared with values on the Adobe side, to avoid encoding issues.
72+
# The value must be a Python codec name or alias, such as 'latin1' or 'utf-8.
73+
# See https://docs.python.org/2/library/codecs.html#standard-encodings for details.
74+
#string_encoding: utf-8
75+
6876
# (optional) user_identity_type_format (no default)
6977
# user_identity_type_format specifies how to construct a user's desired identity
7078
# type on the Adobe side by combining constant strings with attribute values.
@@ -86,6 +94,8 @@ group_filter_format: "(&(|(objectCategory=group)(objectClass=groupOfNames)(objec
8694
# The default value used here is simple, and suitable for OpenLDAP systems. If you
8795
# are using a non-email-aware AD system, which holds the username separately
8896
# from the domain name, you may want: "{sAMAccountName}@mydomain.com"
97+
# NOTE: for this and every format setting, the constant strings must be in
98+
# the encoding specified by the string_encoding setting, above.
8999
user_email_format: "{mail}"
90100

91101
# (optional) user_domain_format (no default value)

examples/config files - basic/4 connector-csv.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@
2020
# To set it to a specific value, uncomment this setting:
2121
#delimiter: ","
2222

23+
# (optional) string_encoding (default value given below)
24+
# string_encoding specifies the Unicode string encoding used in the CSV file.
25+
# All values retrieved from the file are converted to Unicode before being
26+
# sent to or compared with values on the Adobe side, to avoid encoding issues.
27+
# The value must be a Python codec name or alias, such as 'latin1' or 'utf-8.
28+
# See https://docs.python.org/2/library/codecs.html#standard-encodings for details.
29+
#string_encoding: utf-8
30+
2331
# (optional) email_column_name (default "email")
2432
# The column name that contains the user's email address.
2533
# Values in this column must be valid, unquoted email addresses.

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
'pycrypto',
4444
'python-ldap==2.4.25',
4545
'PyYAML',
46-
'umapi-client>=2.3',
46+
'umapi-client>=2.4.1',
4747
'psutil',
4848
'keyring'
4949
],

user_sync/app.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@
2929
import user_sync.config
3030
import user_sync.connector.directory
3131
import user_sync.connector.umapi
32-
from user_sync.error import AssertionException
3332
import user_sync.lockfile
3433
import user_sync.rules
34+
from user_sync.error import AssertionException
3535
from user_sync.version import __version__ as APP_VERSION
3636

3737
LOG_STRING_FORMAT = '%(asctime)s %(process)d %(levelname)s %(name)s - %(message)s'
@@ -85,6 +85,12 @@ def process_args():
8585
"When using this option, you must also specify what you want done with Adobe-only "
8686
"users by also including --adobe-only-user-action and one of its arguments",
8787
metavar='input_path', dest='stray_list_input_path')
88+
parser.add_argument('--config-file-encoding',
89+
help="config files are expected to contain only ASCII characters; if you "
90+
"use an extended character set (e.g., to specify group names), then "
91+
"specify the encoding of your configuration files with this argument. "
92+
"All encoding names understood by Python are allowed.",
93+
dest='encoding_name', default='ascii')
8894
return parser.parse_args()
8995

9096

@@ -137,7 +143,7 @@ def init_log(logging_config):
137143
fileHandler.setLevel(file_log_level)
138144
fileHandler.setFormatter(logging.Formatter(LOG_STRING_FORMAT, LOG_DATE_FORMAT))
139145
logging.getLogger().addHandler(fileHandler)
140-
if (unknown_file_log_level == True):
146+
if unknown_file_log_level:
141147
logger.log(logging.WARNING, 'Unknown file log level: %s setting to info' % options['file_log_level'])
142148

143149

@@ -200,6 +206,7 @@ def begin_work(config_loader):
200206
def create_config_loader(args):
201207
config_bootstrap_options = {
202208
'main_config_filename': args.config_filename,
209+
'config_file_encoding': args.encoding_name,
203210
}
204211
config_loader = user_sync.config.ConfigLoader(config_bootstrap_options)
205212
return config_loader

user_sync/config.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1919
# SOFTWARE.
2020

21+
import codecs
2122
import logging
2223
import os
2324
import re
@@ -41,6 +42,7 @@ def __init__(self, caller_options):
4142
self.options = options = {
4243
# these are in alphabetical order! Always add new ones that way!
4344
'delete_strays': False,
45+
'config_file_encoding': 'ascii',
4446
'directory_connector_module_name': None,
4547
'directory_connector_overridden_options': None,
4648
'directory_group_filter': None,
@@ -56,14 +58,15 @@ def __init__(self, caller_options):
5658
'update_user_info': True,
5759
'username_filter_regex': None,
5860
}
59-
options.update(caller_options)
60-
61+
options.update(caller_options)
6162
main_config_filename = options.get('main_config_filename')
63+
config_encoding = options['config_file_encoding']
64+
try:
65+
codecs.lookup(config_encoding)
66+
except LookupError:
67+
raise AssertionException("Unknown encoding '%s' specified with --config-file-encoding" % config_encoding)
68+
ConfigFileLoader.config_encoding = config_encoding
6269
main_config_content = ConfigFileLoader.load_root_config(main_config_filename)
63-
64-
if (not os.path.isfile(main_config_filename)):
65-
raise AssertionException('Config file does not exist: %s' % (main_config_filename))
66-
6770
self.logger = logger = logging.getLogger('config')
6871
logger.info("Using main config file: %s", main_config_filename)
6972
self.main_config = DictConfig("<%s>" % main_config_filename, main_config_content)
@@ -606,6 +609,10 @@ class ConfigFileLoader:
606609
'''
607610
Loads config files and does pathname expansion on settings that refer to files or directories
608611
'''
612+
# config files can contain Unicode characters, so an encoding for them
613+
# can be specified as a command line argument. This defaults to ascii.
614+
config_encoding = 'ascii'
615+
609616
# key_paths in the root configuration file that should have filename values
610617
# mapped to their value options. See load_from_yaml for the option meanings.
611618
ROOT_CONFIG_PATH_KEYS = {'/adobe_users/connectors/umapi': (True, True, None),
@@ -680,9 +687,11 @@ def load_from_yaml(cls, filename, path_keys):
680687
cmd = filename[3:-1]
681688
try:
682689
bytes = subprocess.check_output(cmd, cwd=dir, shell=True)
683-
yml = yaml.load(bytes)
690+
yml = yaml.load(bytes.decode(cls.config_encoding, 'strict'))
684691
except subprocess.CalledProcessError as e:
685692
raise AssertionException("Error executing process '%s' in dir '%s': %s" % (cmd, dir, e))
693+
except UnicodeDecodeError as e:
694+
raise AssertionException('Encoding error in process output: %s' % e)
686695
except yaml.error.MarkedYAMLError as e:
687696
raise AssertionException('Error parsing process YAML data: %s' % e)
688697
else:
@@ -693,17 +702,20 @@ def load_from_yaml(cls, filename, path_keys):
693702
cls.filename = os.path.split(cls.filepath)[1]
694703
cls.dirpath = os.path.dirname(cls.filepath)
695704
try:
696-
with open(filename, 'r', 1) as input_file:
697-
yml = yaml.load(input_file)
705+
with open(filename, 'rb', 1) as input_file:
706+
bytes = input_file.read()
707+
yml = yaml.load(bytes.decode(cls.config_encoding, 'strict'))
698708
except IOError as e:
699709
# if a file operation error occurred while loading the
700-
# configuration file, swallow up the exception and re-raise this
710+
# configuration file, swallow up the exception and re-raise it
701711
# as an configuration loader exception.
702-
raise AssertionException('Error reading configuration file: %s' % e)
712+
raise AssertionException("Error reading configuration file '%s': %s" % (cls.filepath, e))
713+
except UnicodeDecodeError as e:
714+
# as above, but in case of encoding errors
715+
raise AssertionException("Encoding error in configuration file '%s: %s" % (cls.filepath, e))
703716
except yaml.error.MarkedYAMLError as e:
704-
# same as above, but indicate this problem has to do with
705-
# parsing the configuration file.
706-
raise AssertionException('Error parsing configuration file: %s' % e)
717+
# as above, but in case of parse errors
718+
raise AssertionException("Error parsing configuration file '%s': %s" % (cls.filepath, e))
707719

708720
# process the content of the dict
709721
for path_key, options in path_keys.iteritems():

user_sync/connector/directory_csv.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def __init__(self, caller_options):
5656
caller_config = user_sync.config.DictConfig('%s configuration' % self.name, caller_options)
5757
builder = user_sync.config.OptionsBuilder(caller_config)
5858
builder.set_string_value('delimiter', None)
59+
builder.set_string_value('string_encoding', 'utf-8')
5960
builder.set_string_value('first_name_column_name', 'firstname')
6061
builder.set_string_value('last_name_column_name', 'lastname')
6162
builder.set_string_value('email_column_name', 'email')
@@ -73,6 +74,8 @@ def __init__(self, caller_options):
7374
logger.debug('%s initialized with options: %s', self.name, options)
7475
caller_config.report_unused_values(logger)
7576

77+
# encoding of column values
78+
self.encoding = options['string_encoding']
7679
# identity type for new users if not specified in column
7780
self.user_identity_type = user_sync.identity_type.parse_identity_type(options['user_identity_type'])
7881

@@ -190,7 +193,4 @@ def get_column_value(self, row, column_name):
190193
:type column_name: str
191194
'''
192195
value = row.get(column_name)
193-
if (value == ''):
194-
value = None
195-
return value
196-
196+
return value.decode(self.encoding) if value else None

user_sync/connector/directory_ldap.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
import string
2222

23-
import keyring
2423
import ldap.controls.libldap
2524

2625
import user_sync.config
@@ -64,6 +63,7 @@ def __init__(self, caller_options):
6463
builder.set_string_value('group_filter_format', '(&(|(objectCategory=group)(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))')
6564
builder.set_string_value('all_users_filter', '(&(objectClass=user)(objectCategory=person)(!(userAccountControl:1.2.840.113556.1.4.803:=2)))')
6665
builder.set_bool_value('require_tls_cert', False)
66+
builder.set_string_value('string_encoding', 'utf-8')
6767
builder.set_string_value('user_identity_type_format', None)
6868
builder.set_string_value('user_email_format', '{mail}')
6969
builder.set_string_value('user_username_format', None)
@@ -79,6 +79,7 @@ def __init__(self, caller_options):
7979
self.logger = logger = user_sync.connector.helper.create_logger(options)
8080
logger.debug('%s initialized with options: %s', self.name, options)
8181

82+
LDAPValueFormatter.encoding = options['string_encoding']
8283
self.user_identity_type = user_sync.identity_type.parse_identity_type(options['user_identity_type'])
8384
self.user_identity_type_formatter = LDAPValueFormatter(options['user_identity_type_format'])
8485
self.user_email_formatter = LDAPValueFormatter(options['user_email_format'])
@@ -367,19 +368,20 @@ def iter_search_result(self, base_dn, scope, filter_string, attributes):
367368
raise
368369

369370
class LDAPValueFormatter(object):
371+
encoding = 'utf-8'
372+
370373
def __init__(self, string_format):
371374
'''
372375
:type string_format: str
373-
'''
374-
if (string_format == None):
376+
'''
377+
if (string_format == None):
375378
attribute_names = []
376379
else:
377380
formatter = string.Formatter()
378381
attribute_names = [item[1] for item in formatter.parse(string_format) if item[1]]
379-
380382
self.string_format = string_format
381383
self.attribute_names = attribute_names
382-
384+
383385
def get_attribute_names(self):
384386
'''
385387
:rtype list(str)
@@ -402,17 +404,17 @@ def generate_value(self, record):
402404
break
403405
values[attribute_name] = value
404406
if values is not None:
405-
result = self.string_format.format(**values)
407+
result = self.string_format.format(**values).decode(self.encoding)
406408
return (result, attribute_name)
407409

408-
@staticmethod
409-
def get_attribute_value(attributes, attribute_name):
410+
@classmethod
411+
def get_attribute_value(cls, attributes, attribute_name):
410412
'''
411413
:type attributes: dict
412414
:type attribute_name: str
413415
'''
414416
if attribute_name in attributes:
415417
attribute_value = attributes[attribute_name]
416418
if (len(attribute_value) > 0):
417-
return attribute_value[0]
419+
return attribute_value[0].decode(cls.encoding)
418420
return None

user_sync/connector/umapi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def __init__(self, name, caller_options):
6565
options['enterprise'] = enterprise_options = enterprise_builder.get_options()
6666
self.options = options
6767
self.logger = logger = helper.create_logger(options)
68-
server_config.report_unused_values(logger)
68+
if server_config: server_config.report_unused_values(logger)
6969
logger.debug('UMAPI initialized with options: %s', options)
7070

7171
# set up the auth dict for umapi-client

user_sync/helper.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,11 @@ def open_file(name, mode, buffering = -1):
3737

3838
def normalize_string(string_value):
3939
'''
40-
:type string_value: str
40+
Normalize a unicode or regular string
41+
:param string_value: either a unicode or regular string or None
42+
:return: the same type that came in
4143
'''
42-
return string_value.strip().lower() if string_value != None else None
44+
return string_value.strip().lower() if string_value is not None else None
4345

4446
def guess_delimiter_from_filename(filename):
4547
'''

user_sync/rules.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ def __init__(self, caller_options):
9393
# in the secondary umapis (and exclude all that don't match). Finally,
9494
# we keep track of user keys (in any umapi) that we have updated, so
9595
# we can correctly report their count.
96+
self.adobe_user_count = 0
9697
self.included_user_keys = set()
9798
self.excluded_user_count = 0
9899
self.updated_user_keys = set()
@@ -172,7 +173,7 @@ def log_action_summary(self, umapi_connectors):
172173
self.action_summary['directory_users_read'] = len(self.directory_user_by_user_key)
173174
self.action_summary['directory_users_selected'] = len(self.filtered_directory_user_by_user_key)
174175
# find the total number of adobe users and excluded users
175-
self.action_summary['adobe_users_read'] = len(self.included_user_keys) + self.excluded_user_count
176+
self.action_summary['adobe_users_read'] = self.adobe_user_count
176177
self.action_summary['adobe_users_excluded'] = self.excluded_user_count
177178
self.action_summary['adobe_users_updated'] = len(self.updated_user_keys)
178179
# find out the number of users that have no changes; this depends on whether
@@ -752,6 +753,7 @@ def update_umapi_users_for_connector(self, umapi_info, umapi_connector):
752753

753754
def is_umapi_user_excluded(self, in_primary_org, user_key, current_groups):
754755
if in_primary_org:
756+
self.adobe_user_count += 1
755757
# in the primary umapi, we actually check the exclusion conditions
756758
identity_type, username, domain = self.parse_user_key(user_key)
757759
if identity_type in self.exclude_identity_types:
@@ -886,7 +888,7 @@ def get_user_key(self, id_type, username, domain, email=None):
886888
domain = ""
887889
elif not domain:
888890
return None
889-
return id_type + ',' + username + ',' + domain
891+
return unicode(id_type) + u',' + unicode(username) + u',' + unicode(domain)
890892

891893
def parse_user_key(self, user_key):
892894
'''Returns the identity_type, username, and domain for the user.

0 commit comments

Comments
 (0)