Modul Wissenbasis
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

932 lines
24 KiB

<?xml version="1.0" encoding="UTF-8"?><transformation>
<info>
<name>wiki2xhtml</name>
<description/>
<extended_description/>
<trans_version/>
<trans_type>Normal</trans_type>
<trans_status>0</trans_status>
<directory>/</directory>
<parameters>
<!-- Named parameter document_id (default value 68). The SQL of the getSrcTitle step references it as ${document_id} to pick the document to process. -->
<parameter>
<name>document_id</name>
<default_value>68</default_value>
<description/>
</parameter>
</parameters>
<log>
<trans-log-table>
<connection/>
<schema/>
<table/>
<size_limit_lines/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STATUS</id>
<enabled>Y</enabled>
<name>STATUS</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
<subject/>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
<subject/>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
<subject/>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
<subject/>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
<subject/>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
<subject/>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>STARTDATE</id>
<enabled>Y</enabled>
<name>STARTDATE</name>
</field>
<field>
<id>ENDDATE</id>
<enabled>Y</enabled>
<name>ENDDATE</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>DEPDATE</id>
<enabled>Y</enabled>
<name>DEPDATE</name>
</field>
<field>
<id>REPLAYDATE</id>
<enabled>Y</enabled>
<name>REPLAYDATE</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>Y</enabled>
<name>LOG_FIELD</name>
</field>
<field>
<id>EXECUTING_SERVER</id>
<enabled>N</enabled>
<name>EXECUTING_SERVER</name>
</field>
<field>
<id>EXECUTING_USER</id>
<enabled>N</enabled>
<name>EXECUTING_USER</name>
</field>
<field>
<id>CLIENT</id>
<enabled>N</enabled>
<name>CLIENT</name>
</field>
</trans-log-table>
<perf-log-table>
<connection/>
<schema/>
<table/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>SEQ_NR</id>
<enabled>Y</enabled>
<name>SEQ_NR</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>INPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>INPUT_BUFFER_ROWS</name>
</field>
<field>
<id>OUTPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>OUTPUT_BUFFER_ROWS</name>
</field>
</perf-log-table>
<channel-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>LOGGING_OBJECT_TYPE</id>
<enabled>Y</enabled>
<name>LOGGING_OBJECT_TYPE</name>
</field>
<field>
<id>OBJECT_NAME</id>
<enabled>Y</enabled>
<name>OBJECT_NAME</name>
</field>
<field>
<id>OBJECT_COPY</id>
<enabled>Y</enabled>
<name>OBJECT_COPY</name>
</field>
<field>
<id>REPOSITORY_DIRECTORY</id>
<enabled>Y</enabled>
<name>REPOSITORY_DIRECTORY</name>
</field>
<field>
<id>FILENAME</id>
<enabled>Y</enabled>
<name>FILENAME</name>
</field>
<field>
<id>OBJECT_ID</id>
<enabled>Y</enabled>
<name>OBJECT_ID</name>
</field>
<field>
<id>OBJECT_REVISION</id>
<enabled>Y</enabled>
<name>OBJECT_REVISION</name>
</field>
<field>
<id>PARENT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>PARENT_CHANNEL_ID</name>
</field>
<field>
<id>ROOT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>ROOT_CHANNEL_ID</name>
</field>
</channel-log-table>
<step-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>N</enabled>
<name>LOG_FIELD</name>
</field>
</step-log-table>
<metrics-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>METRICS_DATE</id>
<enabled>Y</enabled>
<name>METRICS_DATE</name>
</field>
<field>
<id>METRICS_CODE</id>
<enabled>Y</enabled>
<name>METRICS_CODE</name>
</field>
<field>
<id>METRICS_DESCRIPTION</id>
<enabled>Y</enabled>
<name>METRICS_DESCRIPTION</name>
</field>
<field>
<id>METRICS_SUBJECT</id>
<enabled>Y</enabled>
<name>METRICS_SUBJECT</name>
</field>
<field>
<id>METRICS_TYPE</id>
<enabled>Y</enabled>
<name>METRICS_TYPE</name>
</field>
<field>
<id>METRICS_VALUE</id>
<enabled>Y</enabled>
<name>METRICS_VALUE</name>
</field>
</metrics-log-table>
</log>
<maxdate>
<connection/>
<table/>
<field/>
<offset>0.0</offset>
<maxdiff>0.0</maxdiff>
</maxdate>
<size_rowset>10000</size_rowset>
<sleep_time_empty>50</sleep_time_empty>
<sleep_time_full>50</sleep_time_full>
<unique_connections>N</unique_connections>
<feedback_shown>Y</feedback_shown>
<feedback_size>50000</feedback_size>
<using_thread_priorities>Y</using_thread_priorities>
<shared_objects_file/>
<capture_step_performance>N</capture_step_performance>
<step_performance_capturing_delay>1000</step_performance_capturing_delay>
<step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
<dependencies/>
<partitionschemas/>
<slaveservers/>
<clusterschemas/>
<created_user>-</created_user>
<created_date>2017/08/22 11:23:11.075</created_date>
<modified_user>-</modified_user>
<modified_date>2017/08/22 11:23:11.075</modified_date>
<key_for_session_key>H4sIAAAAAAAAAAMAAAAAAAAAAAA=</key_for_session_key>
<is_key_private>N</is_key_private>
</info>
<notepads/>
<order>
<!-- Hops between the steps; all are enabled. They are not listed in flow order. Following the from/to pairs gives one linear chain:
     getSrcTitle, DownloadSrc, save src_api_xml, apiDownload2src, saveSrc, Block this step until steps finish, wiki2html, Update src_xml. -->
<hop>
<from>wiki2html</from>
<to>Update src_xml</to>
<enabled>Y</enabled>
</hop>
<hop>
<from>getSrcTitle</from>
<to>DownloadSrc</to>
<enabled>Y</enabled>
</hop>
<hop>
<from>saveSrc</from>
<to>Block this step until steps finish</to>
<enabled>Y</enabled>
</hop>
<hop>
<from>Block this step until steps finish</from>
<to>wiki2html</to>
<enabled>Y</enabled>
</hop>
<hop>
<from>DownloadSrc</from>
<to>save src_api_xml</to>
<enabled>Y</enabled>
</hop>
<hop>
<from>save src_api_xml</from>
<to>apiDownload2src</to>
<enabled>Y</enabled>
</hop>
<hop>
<from>apiDownload2src</from>
<to>saveSrc</to>
<enabled>Y</enabled>
</hop>
</order>
<step>
<!-- Step of type BlockUntilStepsFinish placed between saveSrc and wiki2html (see the hops). Its steps list names a single step to wait for: saveSrc, copy number 0. -->
<name>Block this step until steps finish</name>
<type>BlockUntilStepsFinish</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<steps>
<step>
<name>saveSrc</name>
<CopyNr>0</CopyNr>
</step>
</steps>
<cluster_schema/>
<remotesteps>
<input/>
<output/>
</remotesteps>
<GUI>
<xloc>384</xloc>
<yloc>288</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>DownloadSrc</name>
<type>UserDefinedJavaClass</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<definitions>
<definition>
<class_type>TRANSFORM_CLASS</class_type>
<class_name>Processor</class_name>
<class_source>import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import java.io.InputStream;
import java.io.IOException;
import java.net.URLEncoder;
/**
 * Fetches the wiki source of one document through the MediaWiki API.
 *
 * Reads the connection settings and the page title from the incoming row,
 * requests https://HOST/API_PATH/api.php (action=query, prop=revisions,
 * rvprop=content, format=xml) for that title and stores the raw response body
 * in the src_api_xml output field.
 *
 * Input fields read: hostname, api_path, cookie_name, cookie_value, src_title.
 *
 * @param smi step meta interface passed in by Kettle (not used here)
 * @param sdi step data interface passed in by Kettle (not used here)
 * @return false once no more input rows are available, true after a row was emitted
 * @throws java.io.IOException if the title cannot be URL-encoded or the HTTP request fails
 * @throws KettleException if src_title is null or a row field cannot be read
 */
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws java.io.IOException,KettleException {
    Object[] r = getRow();
    if (r == null) {
        setOutputDone();
        return false;
    }

    String host = get(Fields.In, "hostname").getString(r);        // host name of the wiki, e.g. "wiki.his.de"
    String api_path = get(Fields.In, "api_path").getString(r);    // path segment placed between the host and /api.php
    String cookieName = get(Fields.In, "cookie_name").getString(r);   // e.g. "wikidb_session"
    String cookieValue = get(Fields.In, "cookie_value").getString(r); // session cookie value; never hard-code or log it
    String title = get(Fields.In, "src_title").getString(r);

    // Fail loudly instead of silently requesting the page "null": the response
    // would otherwise be written to the database by the following steps.
    if (title == null) {
        throw new KettleException("src_title is null; cannot build the MediaWiki API request URL");
    }
    // UnsupportedEncodingException is an IOException and is covered by the throws clause.
    title = URLEncoder.encode(title, "UTF-8");

    String strURL="https://"+host+"/"+api_path+"/api.php?action=query&amp;titles="+title+"&amp;prop=revisions&amp;rvprop=content&amp;format=xml";

    HttpState initialState = authenticateWithCookie(host, cookieName, cookieValue);
    Object[] outputRow = createOutputRow(r, data.outputRowMeta.size());

    // HTTP failures are not caught here; they propagate and fail the step.
    String responseBody = getPageContent(strURL, initialState);

    get(Fields.Out, "src_api_xml").setValue(outputRow, responseBody);
    putRow(data.outputRowMeta, outputRow);
    return true;
}
/**
 * Creates the HTTP state used for the download request.
 *
 * When a cookie name is given, the state carries one cookie built from the
 * host, name and value with path "/"; the last two constructor arguments are
 * null and false. Without a cookie name the state is returned empty.
 *
 * @param host        first Cookie constructor argument (the wiki host name)
 * @param cookieName  cookie name, or null to send no cookie
 * @param cookieValue cookie value
 * @return a new HttpState, never null
 */
public static HttpState authenticateWithCookie(String host, String cookieName, String cookieValue) {
    HttpState state = new HttpState();
    if (cookieName == null) {
        // No cookie configured: return the state without any cookie.
        return state;
    }
    state.addCookie(new Cookie(host, cookieName, cookieValue, "/", null, false));
    return state;
}
/**
 * Performs an HTTP GET on the given URL and returns the response body as text.
 *
 * The connection timeout is 30000 ms and cookies are handled with the RFC 2109
 * policy. The HTTP status code is not inspected: whatever body the server
 * sends is returned, also for error responses.
 *
 * @param strURL       complete request URL
 * @param initialState HTTP state (cookies) to attach to the client
 * @return the response body as returned by getResponseBodyAsString()
 * @throws IOException   if the request or reading the body fails
 * @throws HttpException if an HTTP protocol error occurs
 */
public static String getPageContent(String strURL, HttpState initialState) throws IOException, HttpException {
    HttpClient httpclient = new HttpClient();
    httpclient.getHttpConnectionManager().getParams().setConnectionTimeout(30000);
    httpclient.setState(initialState);
    httpclient.getParams().setCookiePolicy(CookiePolicy.RFC_2109);

    GetMethod httpget = new GetMethod(strURL);
    httpget.addRequestHeader("Content-Type","text/xml; charset=UTF-8");
    try {
        // Execute HTTP GET; the status code is deliberately not evaluated (see above).
        httpclient.executeMethod(httpget);
        return httpget.getResponseBodyAsString();
    } finally {
        // Always hand the connection back, also when the request or the read throws.
        httpget.releaseConnection();
    }
}
</class_source>
</definition>
</definitions>
<fields/>
<clear_result_fields>N</clear_result_fields>
<info_steps/>
<target_steps/>
<usage_parameters/>
<cluster_schema/>
<remotesteps>
<input/>
<output/>
</remotesteps>
<GUI>
<xloc>48</xloc>
<yloc>176</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<!-- Last step of the chain: Update on table kb_document_source through connection eduetl, matching rows on document_id.
     Each value entry pairs a table column (name) with the stream field that supplies it (rename); the stream fields
     contentHtml, headersStr, headersLevelsStr, internalHyperlinksStr and templateOptionsStr are the output fields of the wiki2html step.
     NOTE(review): name = table column / rename = stream field follows the usual Update step layout; confirm against the Kettle documentation. -->
<name>Update src_xml</name>
<type>Update</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<connection>eduetl</connection>
<skip_lookup>N</skip_lookup>
<commit>100</commit>
<use_batch>N</use_batch>
<error_ignored>N</error_ignored>
<ignore_flag_field/>
<lookup>
<schema/>
<table>kb_document_source</table>
<key>
<name>document_id</name>
<field>document_id</field>
<condition>=</condition>
<name2/>
</key>
<value>
<name>src_xml</name>
<rename>contentHtml</rename>
</value>
<value>
<name>last_input</name>
<rename>last_input</rename>
</value>
<value>
<name>src_headers</name>
<rename>headersStr</rename>
</value>
<value>
<name>src_header_levels</name>
<rename>headersLevelsStr</rename>
</value>
<value>
<name>internal_hyperlinks</name>
<rename>internalHyperlinksStr</rename>
</value>
<value>
<name>template_options</name>
<rename>templateOptionsStr</rename>
</value>
</lookup>
<cluster_schema/>
<remotesteps>
<input/>
<output/>
</remotesteps>
<GUI>
<xloc>560</xloc>
<yloc>144</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<!-- XSLT step: applies mediawikiapi_page2text.xsl (located in the transformation's own directory) to the field src_api_xml
     and stores the result in the new field result_text. The JAXP factory is used and the output method is set to text. -->
<name>apiDownload2src</name>
<type>XSLT</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<xslfilename>${Internal.Transformation.Filename.Directory}/mediawikiapi_page2text.xsl</xslfilename>
<fieldname>src_api_xml</fieldname>
<resultfieldname>result_text</resultfieldname>
<xslfilefield/>
<xslfilefielduse>N</xslfilefielduse>
<xslfieldisafile>N</xslfieldisafile>
<xslfactory>JAXP</xslfactory>
<parameters/>
<outputproperties>
<outputproperty>
<name>method</name>
<value>text</value>
</outputproperty>
</outputproperties>
<cluster_schema/>
<remotesteps>
<input/>
<output/>
</remotesteps>
<GUI>
<xloc>240</xloc>
<yloc>224</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<!-- First step of the chain: table input on connection eduetl. The query joins kb_document_source (S) with kb_webconnection (W)
     on W.id = S.webconnection_id and restricts the result to the document given by the document_id parameter.
     variables_active is Y, so ${document_id} is resolved inside the SQL text.
     NOTE(review): the parameter value is inserted into the SQL as text; it should only ever be a numeric id. -->
<name>getSrcTitle</name>
<type>TableInput</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<connection>eduetl</connection>
<sql>select S.document_id, S.src_title, S.src_text,S.src_api_xml,
S.src_url,W.hostname,W.api_path,W.cookie_name, W.cookie_value, today() as last_input
from kb_document_source S, kb_webconnection W
where W.id=S.webconnection_id
and S.document_id=${document_id}</sql>
<limit>0</limit>
<lookup/>
<execute_each_row>N</execute_each_row>
<variables_active>Y</variables_active>
<lazy_conversion_active>N</lazy_conversion_active>
<cluster_schema/>
<remotesteps>
<input/>
<output/>
</remotesteps>
<GUI>
<xloc>64</xloc>
<yloc>48</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<!-- Update on table kb_document_source through connection eduetl, matching rows on document_id:
     column src_text is paired with stream field result_text, the result field of the apiDownload2src XSLT step.
     The step "Block this step until steps finish" waits for this step. -->
<name>saveSrc</name>
<type>Update</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<connection>eduetl</connection>
<skip_lookup>N</skip_lookup>
<commit>100</commit>
<use_batch>N</use_batch>
<error_ignored>N</error_ignored>
<ignore_flag_field/>
<lookup>
<schema/>
<table>kb_document_source</table>
<key>
<name>document_id</name>
<field>document_id</field>
<condition>=</condition>
<name2/>
</key>
<value>
<name>src_text</name>
<rename>result_text</rename>
</value>
</lookup>
<cluster_schema/>
<remotesteps>
<input/>
<output/>
</remotesteps>
<GUI>
<xloc>336</xloc>
<yloc>208</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>wiki2html</name>
<type>ScriptValueMod</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<compatible>N</compatible>
<optimizationLevel>9</optimizationLevel>
<jsScripts>
<jsScript>
<jsScript_type>0</jsScript_type>
<jsScript_name>Script 1</jsScript_name>
<jsScript_script>//Script here
// Converts the wiki text in the input field result_text and exposes the results
// as script variables; the variable names contentHtml, headersStr, headersLevelsStr,
// internalHyperlinksStr and templateOptionsStr match the output fields declared for this step.
//
// Assumes that mwtools.js is located in the transformation directory:
var transformationPath = getVariable("Internal.Transformation.Filename.Directory", "");
var jsScriptPath = transformationPath + "/mwtools.js";
LoadScriptFile(jsScriptPath);
// Disabled debug aid ("klappt" is German for "works"):
//alert("klappt");
// wikiModel is not defined in this script; presumably it is provided by mwtools.js (verify there).
var newWikiModel = new wikiModel(result_text);
// Disabled earlier variant that called mw2xhtml directly:
//var contentHtml=mw2xhtml(result_text);
var contentHtml=newWikiModel.wikiHtml;
var headersStr=newWikiModel.headersStr;
var headersLevelsStr=newWikiModel.headersLevelsStr;
var internalHyperlinksStr=newWikiModel.internalHyperlinksStr;
var templateOptionsStr=newWikiModel.templateOptionsStr;
</jsScript_script>
</jsScript>
</jsScripts>
<fields>
<field>
<name>contentHtml</name>
<rename>contentHtml</rename>
<type>String</type>
<length>-1</length>
<precision>-1</precision>
<replace>N</replace>
</field>
<field>
<name>headersStr</name>
<rename>headersStr</rename>
<type>String</type>
<length>-1</length>
<precision>-1</precision>
<replace>N</replace>
</field>
<field>
<name>headersLevelsStr</name>
<rename>headersLevelsStr</rename>
<type>String</type>
<length>-1</length>
<precision>-1</precision>
<replace>N</replace>
</field>
<field>
<name>internalHyperlinksStr</name>
<rename>internalHyperlinksStr</rename>
<type>String</type>
<length>-1</length>
<precision>-1</precision>
<replace>N</replace>
</field>
<field>
<name>templateOptionsStr</name>
<rename>templateOptionsStr</rename>
<type>String</type>
<length>-1</length>
<precision>-1</precision>
<replace>N</replace>
</field>
</fields>
<cluster_schema/>
<remotesteps>
<input/>
<output/>
</remotesteps>
<GUI>
<xloc>528</xloc>
<yloc>288</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<!-- Update on table kb_document_source through connection eduetl, matching rows on document_id:
     column src_api_xml is paired with stream field src_api_xml, which the DownloadSrc step fills with the raw API response. -->
<name>save src_api_xml</name>
<type>Update</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<connection>eduetl</connection>
<skip_lookup>N</skip_lookup>
<commit>100</commit>
<use_batch>N</use_batch>
<error_ignored>N</error_ignored>
<ignore_flag_field/>
<lookup>
<schema/>
<table>kb_document_source</table>
<key>
<name>document_id</name>
<field>document_id</field>
<condition>=</condition>
<name2/>
</key>
<value>
<name>src_api_xml</name>
<rename>src_api_xml</rename>
</value>
</lookup>
<cluster_schema/>
<remotesteps>
<input/>
<output/>
</remotesteps>
<GUI>
<xloc>160</xloc>
<yloc>176</yloc>
<draw>Y</draw>
</GUI>
</step>
<step_error_handling/>
<slave-step-copy-partition-distribution/>
<slave_transformation>N</slave_transformation>
</transformation>