
How do I load an XML file with repeating tags and attributes into Spark 2.1?

I have an XML file with the following structure:

<?xml version="1.0"?> 
<catalog> 
<new> 
    <book id="bk101" language="en"> 
     <author id="4452" primary="true">Gambardella, Matthew</author> 
     <title primary="true">XML Developer's Guide</title> 
     <genre primary="false">Computer</genre> 
     <publisher primary="true" id="US124"> 
     <firm id="4124">Amazon LLC</firm> 
     <address>NY, USA</address> 
     <email type="official">[email protected]</email> 
     <contact_person id="3351"> 
      <name>Rajesh K.</name> 
      <email type="personal">[email protected]</email> 
     </contact_person> 
     </publisher> 
    </book> 
    <book id="bk103" language="en"> 
     <author id="4452" primary="true">Corets, Eva</author> 
     <title primary="true">Maeve Ascendant</title> 
     <genre primary="false">Fantasy</genre> 
     <publisher primary="true" id="US136"> 
     <firm id="4524">Oreally LLC</firm> 
     <address>NY, USA</address> 
     <email type="official">[email protected]</email> 
     <contact_person id="1573"> 
      <name>Prajakta G.</name> 
      <email type="personal">[email protected]</email> 
     </contact_person> 
     </publisher> 
    </book> 
    </new> 
    <removed> 
    <book id="bk104" language="en"> 
     <author id="4452" primary="true">Corets, Eva</author> 
     <title primary="true">Oberon's Legacy</title> 
     <genre primary="false">Fantasy</genre> 
     <publisher primary="true" id="US137"> 
     <firm id="4524">Oreally LLC</firm> 
     <address>NY, USA</address> 
     <email type="official">[email protected]</email> 
     <contact_person id="1573"> 
      <name>Prajakta G.</name> 
      <email type="personal">[email protected]</email> 
     </contact_person> 
     </publisher> 
    </book> 
    </removed> 
</catalog> 

How could I load this into a Dataset? I tried to follow the example from Databricks, but I got the error: AnalysisException: Reference '_id' is ambiguous, could be: _id#1, _id#3
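
For context, a minimal loading sketch along the lines of the Databricks spark-xml example; the file path catalog.xml, the local master, and having com.databricks:spark-xml_2.11:0.4.1 on the classpath are assumptions here:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("load-catalog-xml")
  .master("local[*]")  // assumption: local run for the sketch
  .getOrCreate()

// Treat each <book> element as one row; nested elements become structs,
// and XML attributes surface as fields with the default "_" prefix (hence _id).
val books = spark.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .load("catalog.xml")  // assumption: the XML above saved as catalog.xml

books.printSchema()
books.show(truncate = false)

Because every attribute gets the same "_" prefix, the field name _id appears at several nesting levels here (book, author, firm, contact_person), which is a plausible source of the ambiguous reference.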

In my StructType schema, I replaced the StructField '_id' with '_id#1', '_id#2', and so on, but then I got a different error:

Exception in thread "main" java.lang.ExceptionInInitializerError 
       at org.apache.spark.SparkContext.withScope(SparkContext.scala:701) 
       at org.apache.spark.SparkContext.newAPIHadoopFile(SparkContext.scala:1094) 
       at com.databricks.spark.xml.util.XmlFile$.withCharset(XmlFile.scala:46) 
       at com.databricks.spark.xml.DefaultSource$$anonfun$createRelation$1.apply(DefaultSource.scala:62) 
       at com.databricks.spark.xml.DefaultSource$$anonfun$createRelation$1.apply(DefaultSource.scala:62) 
       at com.databricks.spark.xml.XmlRelation.buildScan(XmlRelation.scala:54) 
       at com.databricks.spark.xml.XmlRelation.buildScan(XmlRelation.scala:63) 
       at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$12.apply(DataSourceStrategy.scala:343) 
       at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$12.apply(DataSourceStrategy.scala:343) 
       at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$pruneFilterProject$1.apply(DataSourceStrategy.scala:384) 
       at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$anonfun$pruneFilterProject$1.apply(DataSourceStrategy.scala:383) 
       at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProjectRaw(DataSourceStrategy.scala:464) 
       at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProject(DataSourceStrategy.scala:379) 
       at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:339) 
       at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:62) 
       at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:62) 
       at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434) 
       at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) 
       at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439) 
       at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92) 
       at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:77) 
       at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2$$anonfun$apply$2.apply(QueryPlanner.scala:74) 
       at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157) 
       at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:157) 
       at scala.collection.Iterator$class.foreach(Iterator.scala:893) 
       at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) 
       at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:157) 
       at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1336) 
       at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:74) 
       at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$2.apply(QueryPlanner.scala:66) 
       at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434) 
       at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440) 
       at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:92) 
       at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:79) 
       at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:75) 
       at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:84) 
       at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:84) 
       at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2791) 
       at org.apache.spark.sql.Dataset.head(Dataset.scala:2112) 
       at org.apache.spark.sql.Dataset.take(Dataset.scala:2327) 
       at org.apache.spark.sql.Dataset.showString(Dataset.scala:248) 
       at org.apache.spark.sql.Dataset.show(Dataset.scala:636) 
       at org.apache.spark.sql.Dataset.show(Dataset.scala:595) 

Answer


A solution was found that resolved the second error:

Add the old Jackson version to the pom file: jackson-core 2.6.7 and jackson-databind 2.6.7.
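
A sketch of the matching pom.xml entries, using the standard Jackson coordinates (whether a plain dependency is enough or a dependencyManagement pin is needed depends on the rest of the build):

<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-core</artifactId>
    <version>2.6.7</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.6.7</version>
</dependency>

Spark 2.1 was itself built against Jackson 2.6.x, so pinning 2.6.7 avoids the binary incompatibility that a newer transitive Jackson version can trigger.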