Changeset 1274


Ignore:
Timestamp:
Nov 2, 2011, 8:38:42 PM (12 years ago)
Author:
George Lilly
Message:

modification with blocking factor of 10000 nodes

File:
1 edited

Legend:

Unmodified
Added
Removed
  • fmts/trunk/p/C0XMAIN.m

    r1273 r1274  
    7777 K ZTMP
    7878 S ZLOC=$NA(^TMP("C0X","WGET",$J))
     79 K @ZLOC
    7980 S C0XSTART=$$NOW^XLFDT
    8081 W !,"STARTED: ",C0XSTART
     
    103104 D STORETXT(ZRDF,ZTXTNM,FARY)
    104105 W !,"ADDED: ",ZGRAPH," ",ZSUBJECT," fmts:rdfSource ",ZTXTNM
    105  D PROCESS(.G,ZRDF,ZNAME,ZGRAPH,FARY) ; PARSE AND INSERT THE RDF
     106 D PROCESS2(.G,ZRDF,ZNAME,ZGRAPH,FARY) ; PARSE AND INSERT THE RDF
    106107 Q
    107108 ;
     
    258259 Q
    259260 ;
     261PROCESS2(ZRTN,ZRDF,ZGRF,ZMETA,FARY) ; PROCESS AN INCOMING RDF FILE
     262 ; ZRTN IS PASS BY REFERENCE AND RETURNS MESSAGES ABOUT THE PROCESSING
     263 ; ZRDF IS PASSED BY NAME AND IS THE GLOBAL CONTAINING THE RDF FILE
     264 ; ZGRF IS THE NAME OF THE GRAPH TO USE IN THE TRIPLE STORE FOR RESULTS
     265 ; ZMETA IS OPTIONAL AND IS THE NAME OF THE GRAPH TO STORE METADATA
     266 ;
     267 I '$D(FARY) D  ;
     268 . D INITFARY("C0XFARY")
     269 . S FARY="C0XFARY"
     270 D USEFARY(FARY)
     271 ;N BATCNT
     272 ;N BATMAX
     273 S BATCNT=0 ; BATCH COUNTER
     274 S BATMAX=10000 ; TRY BATCHES OF THIS SIZE
     275 ; -- first parse the rdf file with the MXML parser
     276 ;S C0XDOCID=$$PARSE^C0CNHIN(ZRDF,"C0XARRAY") ; PARSE WITH MXML
     277 S C0XDOCID=$$EN^MXMLDOM(ZRDF,"W")
     278 ; -- assign the MXLM dom global name to ZDOM
     279 S ZDOM=$NA(^TMP("MXMLDOM",$J,C0XDOCID))
     280 W !,$O(@ZDOM@(""),-1)," XML NODES PARSED"
     281 ; -- populate the metagraph to point to the graph with status unfinished
     282 S METAS=$$ANONS ; GET AN ANONOMOUS RANDOM SUBJECT
     283 I '$D(ZMETA) S ZMETA="_:G"_$$LKY9 ; RANDOM GRAPH NAME FOR METAGRAPH
     284 D ADD(ZMETA,METAS,"fmts:about",ZGRF,FARY) ; POINT THE META TO THE GRAPH
     285 D ADD(ZMETA,METAS,"fmts:status","unfinished",FARY) ; mark as unfinished
     286 ;S C0XDATE=$$FMDTOUTC^C0CUTIL($$NOW^XLFDT,"DT")
     287 S C0XDATE=$$NOW^XLFDT
     288 D ADD(ZMETA,METAS,"fmts:dateTime",C0XDATE,FARY)
     289 D UPDIE(.C0XFDA) ; commit the metagraph changes to the triple store
     290 ; --
     291 ; -- pull out the vocabularies in the RDF statement. marked with xmlns:
     292 ; -- put them in a local variable for quick reference
     293 ; -- TODO: create a graph for vocabularies and validate incoming against it
     294 ;
     295 S C0XVOC=""
     296 N ZI,ZJ,ZK S ZI=""
     297 F  S ZI=$O(@ZDOM@(1,"A",ZI)) Q:ZI=""  D  ; FOR EACH xmlns
     298 . S ZVOC=$P(ZI,"xmlns:",2)
     299 . I ZVOC'="" S C0XVOC(ZVOC)=$G(@ZDOM@(1,"A",ZI))
     300 ;W !,"VOCABS:" ZWR C0XVOC
     301 ;
     302 ; -- look for children called rdf:Description. quit if none. not an rdf file
     303 ;
     304 S ZI=$O(@ZDOM@(1,"C",""))
     305 I $G(@ZDOM@(1,"C",ZI))'="rdf:Description" D  Q  ; not an rdf file
     306 . W !,"Error. Not an RDF file. Cannot process."
     307 ;
     308 ; -- now process the rdf description children
     309 ;
     310 S ZI=""
     311 S (C0XSUB,C0XPRE,C0XOBJ)="" ; INITIALIZE subject, object and predicate
     312 F  S ZI=$O(@ZDOM@(1,"C",ZI)) Q:ZI=""  D  ;
     313 . ; -- we are skipping any child that is not rdf:Description
     314 . ; -- TODO: check to see if this is right in general
     315 . ;
     316 . IF $G(@ZDOM@(1,"C",ZI))'="rdf:Description" D  Q  ;
     317 . . W !,"SKIPPING NODE: ",ZI
     318 . ; -- now looking for the subject for the triples
     319 . S ZX=$G(@ZDOM@(ZI,"A","rdf:about"))
     320 . I ZX'="" D  ; we have the subject
     321 . . ;W " about: ",ZX
     322 . . S C0XSUB=ZX
     323 . E  D  ;
     324 . . S ZX=$G(@ZDOM@(ZI,"A","rdf:nodeID")) ; node id is another style of subject
     325 . . I ZX'="" D  ;
     326 . . . S C0XSUB=ZX
     327 . I C0XSUB="" S C0XSUB=$$ANONS ; DEFAULT TO BLANK SUBJECT
     328 . ;
     329 . ; -- we now have the subject. the children of this node have the rest
     330 . ;
     331 . S ZJ="" ; for the children of the rdf:Description nodes
     332 . F  S ZJ=$O(@ZDOM@(ZI,"C",ZJ)) Q:ZJ=""  D  ; for each child
     333 . . S C0XPRE=@ZDOM@(ZJ) ; the predicate without a prefix
     334 . . S ZX=$G(@ZDOM@(ZJ,"A","xmlns")) ; name space
     335 . . I ZX'="" S C0XPRE=ZX_C0XPRE ; add the namespace prefix
     336 . . I C0XPRE[":" D  ; expand using vocabulary
     337 . . . N ZB,ZA
     338 . . . S ZB=$P(C0XPRE,":",1)
     339 . . . S ZA=$P(C0XPRE,":",2)
     340 . . . I $G(C0XVOC(ZB))'="" D  ;
     341 . . . . S C0XPRE=C0XVOC(ZB)_ZA ; expanded
     342 . . S ZY=$G(@ZDOM@(ZJ,"A","rdf:resource")) ; potential object
     343 . . I ZY'="" D  Q ;
     344 . . . S C0XOBJ=ZY ; object
     345 . . . D ADD2(ZGRF,C0XSUB,C0XPRE,C0XOBJ) ; finally. our first real triple
     346 . . ; -- this is an else because of the quit above
     347 . . S ZX=$G(@ZDOM@(ZJ,"A","rdf:nodeID")) ; fishing for nodeId object
     348 . . I ZX'="" D  Q  ; got one
     349 . . . S C0XOBJ=ZX ; we are using the incoming nodeIDs as object/subject
     350 . . . ; without change... this could be foolish .. look at it again later
     351 . . . D ADD2(ZGRF,C0XSUB,C0XPRE,C0XOBJ) ; go for it and add a node
     352 . . S C0XOBJ=$G(@ZDOM@(ZJ,"T",1)) ; hopefully an object is here
     353 . . I C0XOBJ="" D  Q  ; not a happy situation
     354 . . . W !,"ERROR, NO OBJECT FOUND FOR NODE: ",ZJ
     355 . . D ADD2(ZGRF,C0XSUB,C0XPRE,C0XOBJ) ; go for it and add a node
     356 W !,"INSERTING ",C0XCNT," TRIPLES"
     357 I $D(C0XFDA) D UPDIE(.C0XFDA) ; commit the updates to the file
     358 ; next, mark the graph as finished
     359 S C0XEND=$$NOW^XLFDT
     360 W !," ENDED AT: ",C0XEND
     361 S C0XDIFF=$$FMDIFF^XLFDT(C0XEND,C0XSTART,2)
     362 W !," ELAPSED TIME: ",C0XDIFF," SECONDS"
     363 W !," APPROXIMATELY ",$P(C0XCNT/C0XDIFF,".")," TRIPLES PER SECOND"
     364 Q
     365 ;
    260366SHOW(ZN) ;
    261367 ZWR ^TMP("MXMLDOM",$J,1,ZN,*)
     
    301407 Q
    302408 ;
     409ADD2(ZG,ZS,ZP,ZO,FARY) ; ADD A TRIPLE TO THE TRIPLESTORE. ALL VALUES ARE TEXT
     410 ; THE FDA IS SET UP BUT THE FILES ARE NOT UPDATED. CALL UPDIE TO COMPLETE
     411 I '$D(FARY) D  ;
     412 . D INITFARY("C0XFARY")
     413 . S FARY="C0XFARY"
     414 D USEFARY(FARY)
     415 I '$D(C0XCNT) S C0XCNT=0
     416 N ZNODE
     417 S ZNODE="N"_$$LKY17
     418 N ZNARY ; GET READY TO CALL IENOFA
     419 S ZNARY("ZG",ZG)=""
     420 S ZNARY("ZS",ZS)=""
     421 S ZNARY("ZP",ZP)=""
     422 S ZNARY("ZO",ZO)=""
     423 D IENOFA(.ZIENS,.ZNARY,FARY) ; RESOLVE/ADD STRINGS
     424 ;S ZGIEN=$$IENOF(ZG) ; LAYGO TO GET IEN
     425 ;S ZSIEN=$$IENOF(ZS)
     426 ;S ZPIEN=$$IENOF(ZP)
     427 ;S ZOIEN=$$IENOF(ZO)
     428 ;I $D(C0XFDA) D UPDIE ; ADD THE STRINGS IF NEEDED
     429 S C0XCNT=C0XCNT+1
     430 S C0XFDA(C0XTFN,"?+"_C0XCNT_",",.01)=ZNODE
     431 S C0XFDA(C0XTFN,"?+"_C0XCNT_",",.02)=$O(ZIENS("IEN","ZG",""))
     432 S C0XFDA(C0XTFN,"?+"_C0XCNT_",",.03)=$O(ZIENS("IEN","ZS",""))
     433 S C0XFDA(C0XTFN,"?+"_C0XCNT_",",.04)=$O(ZIENS("IEN","ZP",""))
     434 S C0XFDA(C0XTFN,"?+"_C0XCNT_",",.05)=$O(ZIENS("IEN","ZO",""))
     435 S BATCNT=BATCNT+1
     436 I BATCNT=BATMAX D  ; BATCH IS DONE
     437 . D UPDIE(.C0XFDA)
     438 . K C0XFDA
     439 . S BATCNT=0 ; RESET COUNTER
     440 ; REMEMBER TO CALL UPDIE WHEN YOU'RE DONE
     441 Q
     442 ;
    303443LKY9() ;EXTRINIC THAT RETURNS A RANDOM 9 DIGIT NUMBER. USED FOR GENERATING
    304444 ; UNIQUE NODE AND GRAPH NAMES
Note: See TracChangeset for help on using the changeset viewer.