How to partially normalize a table? - pivot-table

Let's say I have the table:
M1 M2 M3
S1 S2 S1 S2 S1 S2
P1 D1 T1 0 1 2 3 4 5
P2 D2 T2 6 7 8 9 10 11
P3 D3 T3 12 13 14 15 16 17
The XML Input:
<?xml version="1.0" encoding="UTF-8"?>
<transformation>
<info>
<name>test</name>
<description/>
<extended_description/>
<trans_version/>
<trans_type>Normal</trans_type>
<trans_status>0</trans_status>
<directory>/</directory>
<parameters>
</parameters>
<log>
<trans-log-table>
<connection/>
<schema/>
<table/>
<size_limit_lines/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STATUS</id>
<enabled>Y</enabled>
<name>STATUS</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
<subject/>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
<subject/>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
<subject/>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
<subject/>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
<subject/>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
<subject/>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>STARTDATE</id>
<enabled>Y</enabled>
<name>STARTDATE</name>
</field>
<field>
<id>ENDDATE</id>
<enabled>Y</enabled>
<name>ENDDATE</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>DEPDATE</id>
<enabled>Y</enabled>
<name>DEPDATE</name>
</field>
<field>
<id>REPLAYDATE</id>
<enabled>Y</enabled>
<name>REPLAYDATE</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>Y</enabled>
<name>LOG_FIELD</name>
</field>
<field>
<id>EXECUTING_SERVER</id>
<enabled>N</enabled>
<name>EXECUTING_SERVER</name>
</field>
<field>
<id>EXECUTING_USER</id>
<enabled>N</enabled>
<name>EXECUTING_USER</name>
</field>
<field>
<id>CLIENT</id>
<enabled>N</enabled>
<name>CLIENT</name>
</field>
</trans-log-table>
<perf-log-table>
<connection/>
<schema/>
<table/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>SEQ_NR</id>
<enabled>Y</enabled>
<name>SEQ_NR</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>INPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>INPUT_BUFFER_ROWS</name>
</field>
<field>
<id>OUTPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>OUTPUT_BUFFER_ROWS</name>
</field>
</perf-log-table>
<channel-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>LOGGING_OBJECT_TYPE</id>
<enabled>Y</enabled>
<name>LOGGING_OBJECT_TYPE</name>
</field>
<field>
<id>OBJECT_NAME</id>
<enabled>Y</enabled>
<name>OBJECT_NAME</name>
</field>
<field>
<id>OBJECT_COPY</id>
<enabled>Y</enabled>
<name>OBJECT_COPY</name>
</field>
<field>
<id>REPOSITORY_DIRECTORY</id>
<enabled>Y</enabled>
<name>REPOSITORY_DIRECTORY</name>
</field>
<field>
<id>FILENAME</id>
<enabled>Y</enabled>
<name>FILENAME</name>
</field>
<field>
<id>OBJECT_ID</id>
<enabled>Y</enabled>
<name>OBJECT_ID</name>
</field>
<field>
<id>OBJECT_REVISION</id>
<enabled>Y</enabled>
<name>OBJECT_REVISION</name>
</field>
<field>
<id>PARENT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>PARENT_CHANNEL_ID</name>
</field>
<field>
<id>ROOT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>ROOT_CHANNEL_ID</name>
</field>
</channel-log-table>
<step-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>N</enabled>
<name>LOG_FIELD</name>
</field>
</step-log-table>
<metrics-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>METRICS_DATE</id>
<enabled>Y</enabled>
<name>METRICS_DATE</name>
</field>
<field>
<id>METRICS_CODE</id>
<enabled>Y</enabled>
<name>METRICS_CODE</name>
</field>
<field>
<id>METRICS_DESCRIPTION</id>
<enabled>Y</enabled>
<name>METRICS_DESCRIPTION</name>
</field>
<field>
<id>METRICS_SUBJECT</id>
<enabled>Y</enabled>
<name>METRICS_SUBJECT</name>
</field>
<field>
<id>METRICS_TYPE</id>
<enabled>Y</enabled>
<name>METRICS_TYPE</name>
</field>
<field>
<id>METRICS_VALUE</id>
<enabled>Y</enabled>
<name>METRICS_VALUE</name>
</field>
</metrics-log-table>
</log>
<maxdate>
<connection/>
<table/>
<field/>
<offset>0.0</offset>
<maxdiff>0.0</maxdiff>
</maxdate>
<size_rowset>10000</size_rowset>
<sleep_time_empty>50</sleep_time_empty>
<sleep_time_full>50</sleep_time_full>
<unique_connections>N</unique_connections>
<feedback_shown>Y</feedback_shown>
<feedback_size>50000</feedback_size>
<using_thread_priorities>Y</using_thread_priorities>
<shared_objects_file/>
<capture_step_performance>N</capture_step_performance>
<step_performance_capturing_delay>1000</step_performance_capturing_delay>
<step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
<dependencies>
</dependencies>
<partitionschemas>
</partitionschemas>
<slaveservers>
</slaveservers>
<clusterschemas>
</clusterschemas>
<created_user>-</created_user>
<created_date>2018/03/12 14:50:53.529</created_date>
<modified_user>-</modified_user>
<modified_date>2018/03/12 14:50:53.529</modified_date>
<key_for_session_key/>
<is_key_private>N</is_key_private>
</info>
<notepads>
</notepads>
<order>
</order>
<step>
<name>Data Grid</name>
<type>DataGrid</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<fields>
<field>
<name>Product</name>
<type>String</type>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
<field>
<name>Description</name>
<type>String</type>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
<field>
<name>Type</name>
<type>String</type>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
<field>
<name>M1_S1</name>
<type/>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
<field>
<name>M1_S2</name>
<type/>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
<field>
<name>M2_S1</name>
<type/>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
<field>
<name>M2_S2</name>
<type/>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
<field>
<name>M3_S1</name>
<type/>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
<field>
<name>M3_S2</name>
<type/>
<format/>
<currency/>
<decimal/>
<group/>
<length>-1</length>
<precision>-1</precision>
<set_empty_string>N</set_empty_string>
</field>
</fields>
<data>
<line>
<item/>
<item/>
<item/>
<item>M1</item>
<item/>
<item>M2</item>
<item/>
<item>M3</item>
<item/>
</line>
<line>
<item/>
<item/>
<item/>
<item>S1</item>
<item>S2</item>
<item>S1</item>
<item>S2</item>
<item>S1</item>
<item>S2</item>
</line>
<line>
<item>P1</item>
<item>D1</item>
<item>T1</item>
<item>0</item>
<item>1</item>
<item>2</item>
<item>3</item>
<item>4</item>
<item>5</item>
</line>
<line>
<item>P2</item>
<item>D2</item>
<item>T2</item>
<item>6</item>
<item>7</item>
<item>8</item>
<item>9</item>
<item>10</item>
<item>11</item>
</line>
<line>
<item>P3</item>
<item>D3</item>
<item>T3</item>
<item>12</item>
<item>13</item>
<item>14</item>
<item>15</item>
<item>16</item>
<item>17</item>
</line>
</data>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<GUI>
<xloc>32</xloc>
<yloc>32</yloc>
<draw>Y</draw>
</GUI>
</step>
<step_error_handling>
</step_error_handling>
<slave-step-copy-partition-distribution>
</slave-step-copy-partition-distribution>
<slave_transformation>N</slave_transformation>
</transformation>
How can I make it so that the table becomes partially normalized, like:
M1 M2 M3
P1 D1 T1 S1 0 2 4
P1 D1 T1 S2 1 3 5
P2 D2 T2 S1 6 8 10
P2 D2 T2 S2 7 9 11
P3 D3 T3 S1 12 14 16
P3 D3 T3 S2 13 15 17
Do I have to first merge the two headers, and then normalize + split values?
Further explanation:
By "partially normalized" I mean that the table isn't completely normalized, which would look like:
Data
P1 D1 T1 M1 S1 0
P1 D1 T1 M1 S2 1
P1 D1 T1 M2 S1 2
P1 D1 T1 M2 S2 3
P1 D1 T1 M3 S1 4
P1 D1 T1 M3 S2 5
P2 D2 T2 M1 S1 6
P2 D2 T2 M1 S2 7
P2 D2 T2 M2 S1 8
P2 D2 T2 M2 S2 9
P2 D2 T2 M3 S1 10
P2 D2 T2 M3 S2 11
P3 D3 T3 M1 S1 12
P3 D3 T3 M1 S2 13
P3 D3 T3 M2 S1 14
P3 D3 T3 M2 S2 15
P3 D3 T3 M3 S1 16
P3 D3 T3 M3 S2 17
By "merging the headers" I mean having a single header cell with a value such as M2_S1. After merging the headers it is rather trivial to normalize; however, one would then need to split the value of those header cells, e.g. M2_S1 into M2 and S1.
I am expecting an answer in the form of pentaho-spoon steps, and an explanation of what is being done and why it is being done in each step.

Related

How to update empty XML node in MSSQL

I want to update an empty XML node in MSSQL.
I have tried every solution I could find, such as replace, outer apply, and modify, but nothing works.
Please help me find a solution.
<ArrayOfField xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<Field>
<FieldType>1</FieldType>
<DataType>1</DataType>
<Label>department</Label>
<Name>department</Name>
<Description />
<Min />
<Max />
<DefaultValue />
<Required>0</Required>
<SelectItems />
<Formula />
<DataSourceUID xsi:nil="true" />
<TrueText />
<FalseText />
<UnconfirmtyRule xsi:nil="true" />
<UnconfirmtyRequired xsi:nil="true" />
<UnconfirmityFormUID xsi:nil="true" />
<ConfirmationStatus xsi:nil="true" />
<ConfirmationUserBasedType xsi:nil="true" />
<ConfirmationRightUsers />
<Newline>false</Newline>
<ColumnWith>12</ColumnWith>
<Offset>0</Offset>
</Field>
<Field>
<FieldType>1</FieldType>
<DataType>4</DataType>
<Label>Test</Label>
<Name>test</Name>
<Description />
<Min />
<Max />
<DefaultValue />
<Required>0</Required>
<SelectItems />
<Formula />
<TrueText />
<FalseText />
<UnconfirmtyRule xsi:nil="true" />
<UnconfirmtyRequired xsi:nil="true" />
<UnconfirmityFormUID xsi:nil="true" />
<ConfirmationStatus xsi:nil="true" />
<ConfirmationUserBasedType xsi:nil="true" />
<ConfirmationRightUsers />
<Newline>false</Newline>
<ColumnWith>12</ColumnWith>
<Offset>0</Offset>
</Field>
</ArrayOfField>
I want to update DefaultValue where DataType = 1.
The new DefaultValue should look like <DefaultValue>TODAY()</DefaultValue>
I tried this code, but it does not update anything. What is my mistake?
DECLARE @SearchType NVARCHAR(100)=N'1';
DECLARE @ReplaceWith NVARCHAR(100)=N'TODAY()';
UPDATE FormSchema
SET Fields.modify('replace value of
(/ArrayOfField
/Field[DataType=sql:variable("@SearchType")]
/DefaultValue/text())[1]
with sql:variable("@ReplaceWith")')
WHERE Fields.exist('/ArrayOfField
/Field[DataType=sql:variable("@SearchType")]')=1;
You need to use different XML modify operations depending on whether the target element is empty or not (i.e.: it contains no text as opposed to being xsi:nil="true"). For example...
--
-- Setup data...
--
create table dbo.FormSchema (
Fields xml
);
insert dbo.FormSchema (Fields) values
(N'<ArrayOfField>
<Field>
<DataType>1</DataType>
<DefaultValue>Hello, world!</DefaultValue>
</Field>
<Field>
<DataType>2</DataType>
<DefaultValue />
</Field>
</ArrayOfField>'),
(N'<ArrayOfField>
<Field>
<DataType>1</DataType>
<DefaultValue />
</Field>
<Field>
<DataType>3</DataType>
<DefaultValue />
</Field>
</ArrayOfField>');
--
-- Perform updates...
--
DECLARE
@SearchType NVARCHAR(100) = N'1',
@ReplaceWith NVARCHAR(100) = N'TODAY()';
UPDATE dbo.FormSchema
SET Fields.modify('
replace value of (/ArrayOfField/Field[DataType=sql:variable("@SearchType")]/DefaultValue/text())[1]
with sql:variable("@ReplaceWith")
')
WHERE Fields.exist('(/ArrayOfField/Field[DataType=sql:variable("@SearchType")]/DefaultValue/text())[1]')=1;
UPDATE dbo.FormSchema
SET Fields.modify('
insert text{sql:variable("@ReplaceWith")}
into (/ArrayOfField/Field[DataType=sql:variable("@SearchType")]/DefaultValue)[1]
')
WHERE Fields.exist('(/ArrayOfField/Field[DataType=sql:variable("@SearchType")]/DefaultValue)[1]')=1
AND Fields.exist('(/ArrayOfField/Field[DataType=sql:variable("@SearchType")]/DefaultValue/text())[1]')=0;
--
-- Check results...
--
SELECT * FROM dbo.FormSchema;
Which yields the results:
Fields
<ArrayOfField><Field><DataType>1</DataType><DefaultValue>TODAY()</DefaultValue></Field><Field><DataType>2</DataType><DefaultValue /></Field></ArrayOfField>
<ArrayOfField><Field><DataType>1</DataType><DefaultValue>TODAY()</DefaultValue></Field><Field><DataType>3</DataType><DefaultValue /></Field></ArrayOfField>

Updating multiple XML columns using single update in SQL Server

I would like to update a table_A with new values for (AcresDist1, txtAcresDist1, txtAcresDist1Total) XML columns from a second table_B.
The column P_XML is of XML type. I know how to update a single column at once, but I would like to know how to update multiple columns in the XML using a single update statement. Thanks
SQL code:
UPDATE S
SET
P_XML.modify( N'replace value of (/FormValue/f1152_F1/Field[(id/text())[1]="AcresDist1"]/value/text())[1] with sql:column(''T.AcresDist1'')' )
, P_XML.modify( N'replace value of (/FormValue/f1152_F1/Field[(id/text())[1]="txtAcresDist1"]/value/text())[1] with sql:column(''T.txtAcresDist'')' )
, P_XML.modify( N'replace value of (/FormValue/f1152_F1/Field[(id/text())[1]="txtAcresDist1Total"]/value/text())[1] with sql:column(''T.txtAcresDist1Total'')' )
FROM
Table_A AS S
INNER JOIN
Table_B AS T ON s.P_NO = t.P_Number
AND s.FAC_RID = t.Fac_RID;
Here is the sample xml as requested. Thank you.
<FormValue>
<f1152>
<field>
<id>f1152_MainForm</id>
<value />
<tag />
<visible>true</visible>
<history>|09/28/2017 10:50:26 AM||</history>
<description />
<comment />
</field>
<field>
<id>txt_rdoCoverage</id>
<value>Development</value>
<tag />
<visible>false</visible>
<history>|09/28/2017 10:50:26 AM||</history>
<description />
<comment />
</field>
</f1152>
<f1152_F1>
<field>
<id>txtAcresDist1</id>
<value>1.2</value>
<tag />
<visible>false</visible>
<history>|09/28/2017 3:08:14 AM||</history>
<description />
<comment />
</field>
<field>
<id>txtAcresDist1Total</id>
<value>200</value>
<tag />
<visible>false</visible>
<history>|09/28/2017 3:08:14 AM||</history>
<description />
<comment />
</field>
</f1152_F1>

Pivot XML into SQL Columns & values

I have an XML file in the following format. Not every field name will have a value. Each field except for the id field will be varchar(40).
<index>
<doc id="0">
<field name="MFG">
<val>ACME</val>
</field>
<field name="InternalCode">
<val />
</field>
<field name="partnumber">
<val>012345-00</val>
</field>
<field name="partdescription">
<val>PIN</val>
</field>
</doc>
<doc id="1">
<field name="MFG">
<val />
</field>
<field name="InternalCode">
<val>ABCDE</val>
</field>
<field name="partnumber">
<val>919-555-7Z</val>
</field>
<field name="partdescription">
<val>WASHER</val>
</field>
</doc>
<doc id="2">
<field name="MFG">
<val>YOUR COMPANY</val>
</field>
<field name="InternalCode">
<val />
</field>
<field name="partnumber">
<val>131415</val>
</field>
<field name="partdescription">
<val>BOLT</val>
</field>
</doc>
</index>
What I would like to do is to read the XML & populate a table in SQL in the following manner.
In other words, after the rowid, pivot the rest of the attributes as columns and their values as the column value. I'm using the following code that will list the rowid, attribute & their values as rows.
SELECT XMLAttribute.rowid, XMLAttribute.name, XMLAttribute.val
FROM OPENXML (@hdoc, 'index/doc/field', 2 )
WITH (rowid int '../@id',
name VARCHAR(128) '@name',
val varchar(128) 'val'
) AS XMLAttribute
Can this (pivot after the rowid) be done? If so, how?
This is better done with XPath/XQuery than with OPENXML. Check out the documentation on the XML methods nodes() and value(). Also check out an XPath guide online; this is a good one.
DECLARE @i XML=
'<index>
<doc id="0"><field name="MFG"><val>ACME</val></field><field name="InternalCode"><val /></field><field name="partnumber"><val>012345-00</val></field><field name="partdescription"><val>PIN</val></field></doc>
<doc id="1"><field name="MFG"><val /></field><field name="InternalCode"><val>ABCDE</val></field><field name="partnumber"><val>919-555-7Z</val></field><field name="partdescription"><val>WASHER</val></field></doc>
<doc id="2"><field name="MFG"><val>YOUR COMPANY</val></field><field name="InternalCode"><val /></field><field name="partnumber"><val>131415</val></field><field name="partdescription"><val>BOLT</val></field></doc>
</index>';
SELECT
rowid=n.v.value('@id','VARCHAR(40)'),
MFG=n.v.value('(field[@name="MFG"]/val)[1]','VARCHAR(40)'),
InternalCode=n.v.value('(field[@name="InternalCode"]/val)[1]','VARCHAR(40)'),
partnumber=n.v.value('(field[@name="partnumber"]/val)[1]','VARCHAR(40)'),
partdescription=n.v.value('(field[@name="partdescription"]/val)[1]','VARCHAR(40)')
FROM
@i.nodes('/index/doc') AS n(v);
Result:
+-------+--------------+--------------+------------+-----------------+
| rowid | MFG | InternalCode | partnumber | partdescription |
+-------+--------------+--------------+------------+-----------------+
| 0 | ACME | | 012345-00 | PIN |
| 1 | | ABCDE | 919-555-7Z | WASHER |
| 2 | YOUR COMPANY | | 131415 | BOLT |
+-------+--------------+--------------+------------+-----------------+

Solr field depth

How would one set up Solr such that we have "child" node fields?
For example, for this doc, there exist 2 cars, but each car has a subset of colors.
For example:
<doc>
<field name = "make"> Toyota </field>
<field name = "car"> Camri </field>
<field name = "color"> Silver </field>
<field name = "color"> Red </field>
<field name = "car"> Corolla </field>
<field name = "color"> Blue </field>
<field name = "color"> Red </field>
</doc>
How would one go about getting these relationships indexed?
The general practice is to denormalize the database as Solr works with a plain schema. For example, you can make a multi-valued field and put these values into it:
Camri/Silver
Camri/Red
Corolla/Blue
Corolla/Red

Full-import failing when using CachedSqlEntityProcessor giving OutOfMemoryError Exception

Full-import fails when using CachedSqlEntityProcessor, giving the exception:
java.lang.OutOfMemoryError: GC overhead limit exceeded
How can I resolve this issue?
Without using CachedSqlEntityProcessor, it takes 15 hrs to index.
My products-data-config.xml is:
<dataConfig>
<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost:3306/localbazaar" user="root" password="sa" batchSize="100" />
<document name="products">
<entity name="domainProduct" query="SELECT p.PRODUCT_ID, p.NAME, LOWER(REPLACE(REPLACE(p.NAME,' ','-'),'/','-')) AS purl, p.description, p.BRAND_ID, p.CATEGORY_ID, p.GROUP_ID, p.MIN_PRICE, p.MAX_PRICE, p.AUTHOR, p.ISBN10, p.ISBN13, p.OLID, p.EAN13, p.UPCA, p.SKU, p.LANGUAGE, p.FORMAT, p.PUBLISHER, p.SUBJECT, c.NAME AS cname, c.URL_NAME, b.NAME AS bname, LOWER(REPLACE(REPLACE(b.NAME,' ','-'),'/','-')) AS bUrl, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',i.IMAGE_NAME) AS productImage FROM product_t p LEFT OUTER JOIN category_t c ON (c.CATEGORY_ID=p.CATEGORY_ID) LEFT OUTER JOIN brand_t b ON (b.BRAND_ID=p.BRAND_ID) LEFT OUTER JOIN image_t i ON (i.ASSET_ID=p.PRODUCT_ID AND i.ASSET_TYPE_ID = 4 AND i.IMAGE_TYPE_ID = 0)">
<field column="PRODUCT_ID" name="productId" />
<field column="NAME" name="productName" />
<field column="purl" name="productUrlName" />
<field column="description" name="productDescription" />
<field column="BRAND_ID" name="brandId" />
<field column="CATEGORY_ID" name="categoryId" />
<field column="GROUP_ID" name="groupId" />
<field column="MIN_PRICE" name="minPrice" />
<field column="MAX_PRICE" name="maxPrice" />
<field column="AUTHOR" name="author" />
<field column="ISBN10" name="isbn10" />
<field column="ISBN13" name="isbn13" />
<field column="OLID" name="olid" />
<field column="EAN13" name="ean13" />
<field column="UPCA" name="upca" />
<field column="SKU" name="sku" />
<field column="LANGUAGE" name="language" />
<field column="FORMAT" name="format" />
<field column="PUBLISHER" name="publisher" />
<field column="SUBJECT" name="subject" />
<field column="cname" name="categoryName" />
<field column="URL_NAME" name="categoryUrlName" />
<field column="bname" name="brandName" />
<field column="bUrl" name="brandUrlName" />
<field column="productImage" name="productImage" />
<entity name="specifications" query="select PRODUCT_ID, CONCAT(PROPERTY_NAME,':::',property_value) as specifications FROM product_properties_t " processor="CachedSqlEntityProcessor" where="PRODUCT_ID=domainProduct.PRODUCT_ID" />
</entity>
</document>
</dataConfig>
and my store-products-data-config.xml is:
<dataConfig>
<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost:3306/localbazaar" user="root" password="sa" batchSize="100" />
<document name="products">
<entity name="domainStoreProduct" query="SELECT sp.STORE_PRODUCT_ID, sp.STORE_ID, sp.PRODUCT_ID, sp.MIN_PRICE, sp.MAX_PRICE, sp.STORE_TYPE_ID, sp.BUY_X, sp.GET_Y, s.NAME AS sname, LOWER(REPLACE(REPLACE(s.NAME,' ','-'),'/','-')) AS sUrl, s.DESCRIPTION AS sdesc, s.WEB_SITE_UTL, s.EMAIL, s.PHONE, s.MOBILE, s.ACTIVE AS act, a.ADDRESS_ID, a.location, LOWER(REPLACE(REPLACE(a.location,' ','-'),'/','-')) AS urlLoc, a.ADDRESS_LINE1, a.ADDRESS_LINE2, a.LATITUDE, a.LONGITUDE, a.zipcode, a.LANDMARK, a.CITY, CONCAT(a.LATITUDE,',',a.LONGITUDE) AS ll, p.NAME AS pname, LOWER(REPLACE(REPLACE(p.NAME,' ','-'),'/','-')) AS purl, p.description AS pdesc, p.BRAND_ID, p.CATEGORY_ID, p.GROUP_ID, p.AUTHOR, p.ISBN10, p.ISBN13, p.OLID, p.EAN13, p.UPCA, p.SKU, p.LANGUAGE, p.FORMAT, p.PUBLISHER, p.SUBJECT, c.NAME AS cname, c.URL_NAME, b.NAME AS bname, LOWER(REPLACE(REPLACE(b.NAME,' ','-'),'/','-')) AS bUrl, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',ip.IMAGE_NAME) AS pImage, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',ist.IMAGE_NAME) AS sImage, ci.CITY_ID FROM store_products_t sp LEFT OUTER JOIN store_t s ON (sp.STORE_ID=s.STORE_ID) LEFT OUTER JOIN address_t a ON (a.ASSET_TYPE_ID=3 AND a.ASSET_ID=sp.STORE_ID) LEFT OUTER JOIN product_t p ON (p.PRODUCT_ID=sp.PRODUCT_ID) LEFT OUTER JOIN category_t c ON (c.CATEGORY_ID=p.CATEGORY_ID) LEFT OUTER JOIN brand_t b ON (b.BRAND_ID=p.BRAND_ID) LEFT OUTER JOIN image_t ip ON (ip.ASSET_ID=sp.PRODUCT_ID AND ip.ASSET_TYPE_ID=4 AND ip.IMAGE_TYPE_ID=0) LEFT OUTER JOIN image_t ist ON (ist.ASSET_ID=sp.STORE_ID AND ist.ASSET_TYPE_ID=3 AND ist.IMAGE_TYPE_ID=0) LEFT OUTER JOIN city_t ci ON (ci.NAME=a.CITY)">
<field column="STORE_PRODUCT_ID" name="storeProductId" />
<field column="STORE_ID" name="storeId" />
<field column="PRODUCT_ID" name="productId" />
<field column="MIN_PRICE" name="storeMinPrice" />
<field column="MAX_PRICE" name="storeMaxPrice" />
<field column="STORE_TYPE_ID" name="storeTypeId" />
<field column="BUY_X" name="buyX" />
<field column="GET_Y" name="getY" />
<field column="sname" name="storeName" />
<field column="sUrl" name="storeUrlName" />
<field column="sdesc" name="description" />
<field column="WEB_SITE_UTL" name="webSiteUrl" />
<field column="EMAIL" name="email" />
<field column="PHONE" name="phone" />
<field column="MOBILE" name="mobile" />
<field column="act" name="active" />
<field column="ADDRESS_ID" name="addressId" />
<field column="location" name="location" />
<field column="urlLoc" name="urlLocation" />
<field column="ADDRESS_LINE1" name="addressLine1" />
<field column="ADDRESS_LINE2" name="addressLine2" />
<field column="LATITUDE" name="latitude" />
<field column="LONGITUDE" name="longitude" />
<field column="zipcode" name="zipcode" />
<field column="LANDMARK" name="landmark" />
<field column="CITY" name="city" />
<field column="ll" name="latlong" />
<field column="pname" name="productName" />
<field column="purl" name="productUrlName" />
<field column="pdesc" name="productDescription" />
<field column="BRAND_ID" name="brandId" />
<field column="CATEGORY_ID" name="categoryId" />
<field column="GROUP_ID" name="groupId" />
<field column="AUTHOR" name="author" />
<field column="ISBN10" name="isbn10" />
<field column="ISBN13" name="isbn13" />
<field column="OLID" name="olid" />
<field column="EAN13" name="ean13" />
<field column="UPCA" name="upca" />
<field column="SKU" name="sku" />
<field column="LANGUAGE" name="language" />
<field column="FORMAT" name="format" />
<field column="PUBLISHER" name="publisher" />
<field column="SUBJECT" name="subject" />
<field column="cname" name="categoryName" />
<field column="URL_NAME" name="categoryUrlName" />
<field column="bname" name="brandName" />
<field column="bUrl" name="brandUrlName" />
<field column="pImage" name="productImage" />
<field column="sImage" name="storeImage" />
<field column="CITY_ID" name="cityId" />
<entity name="specifications" query="select PRODUCT_ID, CONCAT(PROPERTY_NAME,':::',property_value) as specifications FROM product_properties_t " processor="CachedSqlEntityProcessor" WHERE="PRODUCT_ID= domainStoreProduct.PRODUCT_ID" />
<entity name="storeProperties" query="select STORE_ID, CONCAT(PROPERTY_ID,':::',PROPERTY_VALUE) as storeProperties FROM store_properties_t " processor="CachedSqlEntityProcessor" WHERE="STORE_ID=domainStoreProduct.STORE_ID" />
</entity>
</document>
</dataConfig>
You can try different things:
Try setting the batchSize property. If you tune it correctly, you can increase the performance of your datasource.
SELECT * is ALWAYS slower than providing the columns you need (even if you need all columns). I would suggest using SELECT PRODUCT_ID, NAME, ... instead of using *.
Why do you have the entities b, i and s? You don't use the fields from it, so I don't think they're very useful
Try using the CachedSqlEntityProcessor for your sub-entities. It will only retrieve the data once and re-use it for each sub-entity.
Can your product belong to more than 1 category (is it a multivalued field?), if not, then writing one query using JOINS is faster than writing multiple entities.
EDIT: I suggest separating this into 2 questions, because now it's really confusing for other people to read your new question with my old answer.
I don't think you can choose where the CachedSqlEntityProcessor will put its cache (I think it's always in memory). The problem with your 8 hours of data import is that, because we're talking about a lot of records, a lot of queries will be used (every sub-entity uses its own query).
The solution to your problem is to remove the sub-entity and, in your parent entity, add the query of your sub-entity as a comma-separated list. I suggest looking at this answer.
If you do this, all your specifications (for example) will be stored inside one column as a comma-separated list. You can then use a Solr ScriptTransformer to split the values and create multiple values.
This limits the number of queries to 1 big query and will also limit the use of RAM since it will parse each query individually. I have no clue what the performance will be, because you will have to parse each entity individually.
If this doesn't work, I don't think there is a better solution than to wait 8 hours for the data import to complete. You can't expect Solr to index it all in an instant. You can try using a cron job to run this task overnight.

Resources