Commit de219697 authored by Claudio Cacciari's avatar Claudio Cacciari
Browse files

Merge branch 'metadata' into devel

parents dbbaac72 ba592e35
......@@ -11,6 +11,10 @@ import hashlib
from py2neo import Graph, Node, Relationship, authenticate, GraphError
#import httplib
#
#httplib.HTTPConnection._http_vsn = 10
#httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
logger = logging.getLogger('GraphDBClient')
......@@ -41,6 +45,7 @@ class GraphDBClient():
logger.info("Connecting to " + self.conf.graphdb_addr
+ " with user " + self.conf.graphdb_user)
authenticate(self.conf.graphdb_addr, self.conf.graphdb_user,
self.conf.graphdb_passwd)
logger.debug("Authenticated, now connecting ...")
......@@ -52,7 +57,7 @@ class GraphDBClient():
logger.info('Node "Zone" ' + self.conf.irods_zone_name
+ ' not found, so it will be created')
try:
self.graph.schema.create_uniqueness_constraint("Digital Entity",
self.graph.schema.create_uniqueness_constraint("DigitalEntity",
"location")
self.graph.schema.create_uniqueness_constraint("Zone", "name")
self.graph.schema.create_uniqueness_constraint("Resource", "name")
......@@ -78,12 +83,177 @@ class GraphDBClient():
# dynamic data ###################################
def analyze(self, structuralMap):
logger.info('Compare the manifest with the DB graph')
self._analyzeStructuralMap(structuralMap)
def push(self, structuralMap):
logger.info('Start to upload metadata to the Graph DB')
logger.info('Start to upload the new metadata to the Graph DB')
structList = self._structRecursion(structuralMap)
def _analyzeStructuralMap(self, struct):
if len(struct['nestedObjects']) > 0:
agg = {'label':'Aggregation', 'location':'', 'name':struct['name'],
'checksum':'', 'nodetype':struct['type']}
graphAgg = self.graph.find_one('Aggregation', 'name', struct['name'])
if not graphAgg:
print 'Node Aggregation [{}] is missing'.format(struct['name'])
return [None]
else:
print 'Node Aggregation [{}] found'.format(struct['name'])
if len(struct['filePaths']) > 0:
if len(struct['filePaths']) == 1:
path = struct['filePaths'][0]
agg['location'] = path[7:]
absolutePath = self.root + '/' + path[7:]
sumValue = self.irodsu.getChecksum(absolutePath)
agg['checksum'] = sumValue
self._analyzeOwnershipRel(absolutePath, graphAgg)
self._analyzePIDRel(absolutePath, graphAgg)
self._analyzeResourceRel(absolutePath, graphAgg)
self._analyzeMasterRel(absolutePath, graphAgg)
self._analyzeReplicaRel(absolutePath, graphAgg)
else:
logger.warning('multiple file paths not allowed')
for elem in struct['nestedObjects']:
nodes = self._analyzeStructuralMap(elem)
for n in nodes:
self._searchRelationship(n, 'BELONGS_TO', graphAgg,
graphAgg.properties['name'])
return [graphAgg]
else:
leafs = []
if len(struct['filePaths']) > 0:
for fp in struct['filePaths']:
absolutePath = self.root + '/' + fp[7:]
sumValue = self.irodsu.getChecksum(absolutePath)
de = {'location':fp[7:], 'name':struct['name'],
'checksum':sumValue, 'nodetype':struct['type']}
graphEnt = self.graph.find_one('DigitalEntity', 'location',
fp[7:])
if not graphEnt:
print 'Node Digital Entity [{}] is missing'.format(fp[7:])
else:
print 'Node Digital Entity [{}] found'.format(fp[7:])
self._analyzeOwnershipRel(absolutePath, graphEnt)
self._analyzePIDRel(absolutePath, graphEnt)
self._analyzeResourceRel(absolutePath, graphEnt)
self._analyzeMasterRel(absolutePath, graphEnt)
self._analyzeReplicaRel(absolutePath, graphEnt)
leafs.append(graphEnt)
else:
agg = {'location':'', 'name':struct['name'], 'checksum':'',
'nodetype':struct['type']}
graphAgg = self.graph.find_one('Aggregation', 'name', struct['name'])
if not graphAgg:
print 'Node Aggregation [{}] is missing'.format(struct['name'])
else:
print 'Node Aggregation [{}] found'.format(struct['name'])
leafs.append(graphAgg)
return leafs
def _searchRelationship(self, start, rel, end, endProp):
graphRel = self.graph.match_one(start_node=start, rel_type=rel,
end_node=end)
message = 'Relationship [({},{}) {} {}]'.format(start.properties['name'],
start.properties['location'], rel, endProp)
if not graphRel:
print message + ' is missing'
else:
print message + ' found'
def _analyzeOwnershipRel(self, absolutePath, de):
owners = self.irodsu.getOwners(absolutePath)
if owners:
for owner in owners:
graphPers = self.graph.find_one('Person', 'name', owner)
if not graphPers:
print 'Node Person [{}] is missing'.format(owner)
else:
print 'Node Person [{}] found'.format(owner)
self._searchRelationship(de, 'IS_OWNED_BY', graphPers,
graphPers.properties['name'])
def _analyzePIDRel(self, absolutePath, de):
pids = self.irodsu.getMetadata(absolutePath, 'PID')
if pids and len(pids) > 0:
graphPid = self.graph.find_one('PersistentIdentifier', 'value', pids[0])
if not graphPid:
print 'Node PersistentIdentifier [{}] is missing'.format(pids[0])
else:
print 'Node PersistentIdentifier [{}] found'.format(pids[0])
self._searchRelationship(de, 'UNIQUELY_IDENTIFIED_BY', graphPid,
graphPid.properties['value'])
def _analyzeMasterRel(self, absolutePath, de):
replicas = self.irodsu.getMetadata(absolutePath, 'Replica')
for rpointer in replicas:
graphPo = self.graph.find_one('Pointer', 'value', rpointer)
if not graphPo:
print 'Node Pointer [{}] is missing'.format(rpointer)
else:
print 'Node Pointer [{}] found'.format(rpointer)
self._searchRelationship(de, 'IS_MASTER_OF', graphPo,
graphPo.properties['value'])
def _analyzeReplicaRel(self, absolutePath, de):
parent = None
masters = self.irodsu.getMetadata(absolutePath, 'ROR')
if masters and len(masters) > 0:
master = masters[0]
parents = self.irodsu.getMetadata(absolutePath, 'PPID')
if parents and len(parents) > 0:
parent = parents[0]
else:
return False
graphMas = self.graph.find_one('Pointer', 'value', master)
if not graphMas:
print 'Node Pointer [{}] is missing'.format(master)
else:
print 'Node Pointer [{}] found'.format(master)
self._searchRelationship(de, 'IS_REPLICA_OF', graphMas,
graphMas.properties['value'])
if parent:
graphPar = self.graph.find_one('Pointer', 'value', parent)
if not graphMas:
print 'Node Pointer [{}] is missing'.format(parent)
else:
print 'Node Pointer [{}] found'.format(parent)
self._searchRelationship(de, 'IS_REPLICA_OF', graphPar,
graphPar.properties['value'])
def _analyzeResourceRel(self, absolutePath, de):
resources = self.irodsu.getResources(absolutePath)
if resources:
for res in resources:
resN = self.graph.find_one('Resource', 'name', res)
if resN:
self._searchRelationship(de, 'IS_STORED_IN', resN,
resN.properties['name'])
def _structRecursion(self, d):
if len(d['nestedObjects']) > 0:
......@@ -103,14 +273,8 @@ class GraphDBClient():
agg.properties['location'] = path[7:]
agg.push()
logger.debug('Updated location of entity: ' + str(agg))
absolutePath = self.root + '/' + path[7:]
sumValue = self.irodsu.getChecksum(absolutePath)
if sumValue:
agg.properties['checksum'] = sumValue
agg.push()
logger.debug('Updated checksum of entity: ' + str(agg))
self._defineOwnershipRelation(agg)
self._definePIDRelation(agg)
agg = self._defineDigitalEntity(d['name'], path[7:],
d['type'], agg)
else:
logger.warning('multiple file paths not allowed')
......@@ -134,13 +298,18 @@ class GraphDBClient():
de = self._defineDigitalEntity(d['name'], fp[7:], d['type'])
leafs.append(de)
else:
de = self._defineDigitalEntity(d['name'], fp[7:], d['type'])
leafs.append(de)
path = ''
sumValue = ''
agg = self._createUniqueNode("Aggregation", d['name'],
path[7:],
sumValue,
d['type'])
leafs.append(agg)
return leafs
def _defineDigitalEntity(self, name, path, dtype, absolute=False):
def _defineDigitalEntity(self, name, path, dtype, de=None, absolute=False):
if self.conf.dryrun:
sumValue = ''
......@@ -149,18 +318,25 @@ class GraphDBClient():
if not absolute:
absolutePath = self.root + '/' + path
sumValue = self.irodsu.getChecksum(absolutePath)
if de is not None:
if sumValue:
de.properties['checksum'] = sumValue
de.push()
logger.debug('Updated checksum of entity: ' + str(de))
#TODO what if checksum is null?
de = self._createUniqueNode("Digital Entity",name,path,sumValue,dtype)
logger.debug('Created node: ' + str(de))
self._defineResourceRelation(de, absolute)
self._definePIDRelation(de, absolute)
self._defineMasterRelation(de, absolute)
self._defineReplicaRelation(de, absolute)
self._defineOwnershipRelation(de, absolute)
else:
de = self._createUniqueNode("DigitalEntity", name, path, sumValue,
dtype)
logger.debug('Created node: ' + str(de))
self._defineResourceRelation(de, absolutePath)
self._definePIDRelation(de, absolutePath)
self._defineMasterRelation(de, absolutePath)
self._defineReplicaRelation(de, absolutePath)
self._defineOwnershipRelation(de, absolutePath)
return de
def _definePIDRelation(self, de, absolute=False):
def _definePIDRelation(self, de, absolutePath):
if self.conf.dryrun:
print ("create the graph relation ["+str(de)+","
......@@ -169,13 +345,10 @@ class GraphDBClient():
path = de.properties["location"]
sumValue = de.properties["checksum"]
absolutePath = path
if not absolute:
absolutePath = self.root + '/' + path
pids = self.irodsu.getMetadata(absolutePath, 'PID')
if pids and len(pids) > 0:
pidNode = Node("Persistent Identifier", value = pids[0],
checksum = sumValue)
pidNode = Node("PersistentIdentifier", value = pids[0],
checksum = sumValue)
de_is_uniquely_identified_by_pid = Relationship(de,
"UNIQUELY_IDENTIFIED_BY",
pidNode)
......@@ -186,17 +359,13 @@ class GraphDBClient():
return False
def _defineMasterRelation(self, de, absolute=False):
def _defineMasterRelation(self, de, absolutePath):
if self.conf.dryrun:
print ("create the graph relation ["+str(de)+","
"IS_MASTER_OF,replica]")
return True
path = de.properties["location"]
absolutePath = path
if not absolute:
absolutePath = self.root + '/' + path
replicas = self.irodsu.getMetadata(absolutePath, 'Replica')
for rpointer in replicas:
po = self._createPointer('iRODS', rpointer)
......@@ -208,7 +377,7 @@ class GraphDBClient():
return False
def _defineReplicaRelation(self, re, absolute=False):
def _defineReplicaRelation(self, re, absolutePath):
if self.conf.dryrun:
print ("create the graph relation ["+str(re)+","
......@@ -216,10 +385,6 @@ class GraphDBClient():
return True
parent = None
path = re.properties["location"]
absolutePath = path
if not absolute:
absolutePath = self.root + '/' + path
masters = self.irodsu.getMetadata(absolutePath, 'ROR')
if masters and len(masters) > 0:
master = masters[0]
......@@ -244,7 +409,7 @@ class GraphDBClient():
return True
def _defineOwnershipRelation(self, de, absolute=False):
def _defineOwnershipRelation(self, de, absolutePath):
if self.conf.dryrun:
print ("create the graph relation ["+str(de)+",IS_OWNED_BY,"
......@@ -252,9 +417,6 @@ class GraphDBClient():
return True
path = de.properties["location"]
absolutePath = path
if not absolute:
absolutePath = self.root + '/' + path
owners = self.irodsu.getOwners(absolutePath)
logger.debug('Got the list of owners: ' + str(owners))
if owners:
......@@ -272,17 +434,13 @@ class GraphDBClient():
return False
def _defineResourceRelation(self, de, absolute=False):
def _defineResourceRelation(self, de, absolutePath):
if self.conf.dryrun:
print ("create the graph relation ["+str(de)+",IS_STORED_IN,"
"b2safe_resource_name]")
return True
path = de.properties["location"]
absolutePath = path
if not absolute:
absolutePath = self.root + '/' + path
resources = self.irodsu.getResources(absolutePath)
if resources:
for res in resources:
......@@ -297,7 +455,6 @@ class GraphDBClient():
def _createUniqueNode(self, eudat_type, name, path, checksum, d_type):
entityNew = Node(eudat_type, location = path,
name = name,
checksum = checksum,
......@@ -321,8 +478,8 @@ class GraphDBClient():
hashVal = hashlib.md5(value).hexdigest()
pointerNew = Node('Pointer', type = pointer_type,
value = value,
name = hashVal)
value = value,
name = hashVal)
if self.conf.dryrun:
print ("create the graph node ["+str(pointer)+"]")
else:
......@@ -420,7 +577,10 @@ def sync(args):
xmltext = irodsu.getFile(args.path + '/manifest.xml')
structuralMaps = mp.parse(xmltext)
for smap in structuralMaps:
gdbc.push(smap)
if args.analyze:
gdbc.analyze(smap)
else:
gdbc.push(smap)
logger.info("Sync completed")
......@@ -432,6 +592,8 @@ if __name__ == "__main__":
help="enable debug")
parser.add_argument("-d", "--dryrun", action="store_true",
help="run without performing any real change")
parser.add_argument("-a", "--analyze", action="store_true",
help="compare the manifest with the DB graph")
parser.add_argument("path", help="irods path to the data")
parser.set_defaults(func=sync)
......
......@@ -74,7 +74,8 @@ class MetsManifest():
with open(self.conf.md_jsonld_file, 'r') as f:
collStruct = json.load(f)
self.logger.debug('Building the METS structural map section')
smap = self.buildStructMap(ftree.keys()[0], collStruct)
rootName = ftree.keys()[0] + '_' + str(uuid.uuid4())
smap = self.buildStructMap(rootName, collStruct)
mf.structMap.append(smap)
self.manifest = mf
......@@ -134,7 +135,7 @@ class MetsManifest():
self.logger.debug('Building the METS structMapType')
# initialize the structural map section
smap = structMapType(TYPE="Relational")
divMain = divType(LABEL=rootName, TYPE="digital collection")
divMain = divType(LABEL=rootName, TYPE="digitalCollection")
temp_div = {}
temp_rel = {}
future_rel = {}
......@@ -199,7 +200,7 @@ class MetsManifest():
if 'isRelatedTo' in entity.keys():
# analyze the relations of this entity with others
label = 'rel_' + str(uuid.uuid4())
divRel = divType(LABEL=label, TYPE="entity relation")
divRel = divType(LABEL=label, TYPE="entityRelation")
divRel.append(div)
for relation in entity['isRelatedTo']:
normPathRel = relation['@id'][2:]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment