Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions .github/workflows/master-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: master pull request ci
name: master branch ci
on:
push:
branches: [master]
Expand Down Expand Up @@ -89,16 +89,36 @@ jobs:
- '.github/workflows/*'
# run if the build configuration or both 'core' and 'plugins' files were changed
- name: test all
id: build_all
if: ${{ steps.filter.outputs.buildconf == 'true' || ( steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'true' ) }}
run: ant clean test -buildfile build.xml
run: ant clean test -buildfile build.xml | tee build.log
# run only if 'core' files were changed
- name: test core
id: build_core
if: ${{ steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-core -buildfile build.xml
run: ant clean test-core -buildfile build.xml | tee build.log
# run only if 'plugins' files were changed
- name: test plugins
id: build_plugins
if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-plugins -buildfile build.xml
run: ant clean test-plugins -buildfile build.xml | tee build.log
# check for deprecation warnings in build output
- name: Check for deprecation warnings
if: always()
run: |
if [ -f build.log ]; then
if grep -iEq "warning: \[deprecation\]" build.log ; then
echo "============================================================="
echo "= ❌ Java deprecation warnings detected! Failing the build. ="
echo "============================================================="
grep -iE "warning: \[deprecation\]" -A 2 build.log
exit 1
else
echo "✅ No Java deprecation warnings found."
fi
else
echo "⚠️ build.log not found, skipping deprecation check."
fi
- name: Upload Test Report
uses: actions/upload-artifact@v4
if: always()
Expand All @@ -108,4 +128,4 @@ jobs:
./build/test/TEST-*.xml
./build/**/test/TEST-*.xml
retention-days: 1
overwrite: true
overwrite: true
25 changes: 9 additions & 16 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

<property environment="env"/>

<property name="spotbugs.version" value="4.2.0" />
<property name="spotbugs.version" value="4.9.6" />
<property name="spotbugs.home" value="${ivy.dir}/spotbugs-${spotbugs.version}" />
<property name="spotbugs.jar" value="${spotbugs.home}/lib/spotbugs-ant.jar" />

Expand Down Expand Up @@ -79,7 +79,12 @@
</path>

<presetdef name="javac">
<javac includeantruntime="false" />
<javac includeantruntime="false"
encoding="${build.encoding}"
debug="${javac.debug}"
optimize="${javac.optimize}"
release="${javac.version}"
deprecation="${javac.deprecation}"/>
</presetdef>

<target name="dependencytree" depends="resolve-default" description="Show dependency tree">
Expand Down Expand Up @@ -120,15 +125,9 @@

<target name="compile-core" depends="init, resolve-default" description="--> compile core Java files only">
<javac
encoding="${build.encoding}"
srcdir="${src.dir}"
includes="org/apache/nutch/**/*.java"
destdir="${build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
target="${javac.version}"
source="${javac.version}"
deprecation="${javac.deprecation}">
destdir="${build.classes}">
<compilerarg value="-Xlint:-path"/>
<classpath refid="classpath"/>
</javac>
Expand Down Expand Up @@ -450,15 +449,9 @@
<!-- ================================================================== -->
<target name="compile-core-test" depends="init, compile-core, resolve-test" description="--> compile test code">
<javac
encoding="${build.encoding}"
srcdir="${test.src.dir}"
includes="org/apache/nutch/**/*.java"
destdir="${test.build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
target="${javac.version}"
source="${javac.version}"
deprecation="${javac.deprecation}">
destdir="${test.build.classes}">
<compilerarg value="-Xlint:-path"/>
<classpath refid="test.classpath"/>
</javac>
Expand Down
78 changes: 27 additions & 51 deletions src/java/org/apache/nutch/crawl/CrawlDbReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,27 @@
*/
package org.apache.nutch.crawl;

import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.json.JsonWriteFeature;
import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;
import org.apache.commons.jexl3.JexlScript;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
Expand All @@ -67,26 +47,22 @@
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.AbstractChecker;
import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.SegmentReaderUtil;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Read utility for the CrawlDB.
Expand Down Expand Up @@ -263,7 +239,7 @@ protected static class LineRecordWriter
public LineRecordWriter(DataOutputStream out) {
this.out = out;
jsonMapper.getFactory()
.configure(JsonGenerator.Feature.ESCAPE_NON_ASCII, true);
.configure(JsonWriteFeature.ESCAPE_NON_ASCII.mappedFeature(), true);
SimpleModule module = new SimpleModule();
module.addSerializer(Writable.class, new WritableSerializer());
jsonMapper.registerModule(module);
Expand Down
9 changes: 0 additions & 9 deletions src/java/org/apache/nutch/indexer/IndexWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,6 @@ public interface IndexWriter extends Pluggable, Configurable {
*/
final static String X_POINT_ID = IndexWriter.class.getName();

/**
* @param conf Nutch configuration
* @param name target name of the {@link IndexWriter} to be opened
* @throws IOException Some exception thrown by some writer.
* @deprecated use {@link #open(IndexWriterParams)}} instead.
*/
@Deprecated
public void open(Configuration conf, String name) throws IOException;

/**
* Initializes the internal variables from a given index writer configuration.
*
Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/indexer/IndexWriters.java
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ private Collection<String> getIndexWriters(NutchDocument doc) {
public void open(Configuration conf, String name) throws IOException {
for (Map.Entry<String, IndexWriterWrapper> entry : this.indexWriters
.entrySet()) {
entry.getValue().getIndexWriter().open(conf, name);
entry.getValue().getIndexWriter().open(new IndexWriterParams(new HashMap<>()));
entry.getValue().getIndexWriter()
.open(entry.getValue().getIndexWriterConfig().getParams());
}
Expand Down
4 changes: 2 additions & 2 deletions src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ private static String normalize(final String str) {
* <li>CoNtEntType gives Content-Type</li>
* <li>ConTnTtYpe gives Content-Type</li>
* </ul>
* If no matching with a well-known metadata name is found, then the original
* If no well-known metadata name match is found, then the original
* name is returned.
*
* @param name
Expand All @@ -115,7 +115,7 @@ public static String getNormalizedName(final String name) {
if ((value == null) && (normalized != null)) {
int threshold = Math.min(3, searched.length() / TRESHOLD_DIVIDER);
for (int i = 0; i < normalized.length && value == null; i++) {
if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
if (StringUtils.compareIgnoreCase(searched, normalized[i]) < threshold) { //.getLevenshteinDistance(searched, normalized[i]) < threshold) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still need to investigate whether this is a suitable replacement, and I also need to remove this comment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SpellCheckedMetadata is used only by protocol-http and protocol-httpclient. We could deprecate it, use CaseInsensitiveMetadata instead (see NUTCH-3002) and later remove the class SpellCheckedMetadata entirely. Nowadays, spell-checking HTTP headers sounds odd, while 20 years ago it might have been a good idea.

Changing the behavior so that it contradicts the class name does not seem like the right approach.

If we want to keep the class, we need to use LevenshteinDistance.

value = NAMES_IDX.get(normalized[i]);
}
}
Expand Down
5 changes: 0 additions & 5 deletions src/java/org/apache/nutch/plugin/Plugin.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,4 @@ public PluginDescriptor getDescriptor() {
private void setDescriptor(PluginDescriptor descriptor) {
fDescriptor = descriptor;
}

@Override
protected void finalize() throws Throwable {
shutDown();
Copy link
Member Author

@lewismc lewismc Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we can simply remove the call to shutdown. I need to further investigate options and confirm.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same for me.

}
}
27 changes: 13 additions & 14 deletions src/java/org/apache/nutch/plugin/PluginRepository.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.lang.invoke.MethodHandles;
import java.lang.reflect.Array;
import java.lang.ref.Cleaner;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
Expand Down Expand Up @@ -70,6 +71,8 @@ public class PluginRepository implements URLStreamHandlerFactory {

protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

private static final Cleaner CLEANER = Cleaner.create();

/**
* @param conf a populated {@link Configuration}
* @throws RuntimeException if a fatal runtime error is encountered
Expand Down Expand Up @@ -98,13 +101,22 @@ public PluginRepository(Configuration conf) throws RuntimeException {
try {
installExtensions(this.fRegisteredPlugins);
} catch (PluginRuntimeException e) {
LOG.error("Could not install extensions.", e.toString());
LOG.error("Could not install extensions. {}", e.toString());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

Or: LOG.error("Could not install extensions:", e);

throw new RuntimeException(e.getMessage());
}

registerURLStreamHandlerFactory();

displayStatus();

// Register cleanup action with Cleaner
CLEANER.register(this, () -> {
try {
shutDownActivatedPlugins();
} catch (PluginRuntimeException e) {
LOG.error("Error during cleanup of activated plugins", e);
}
});
}

/**
Expand Down Expand Up @@ -313,19 +325,6 @@ public Plugin getPluginInstance(PluginDescriptor pDescriptor)
}
}

/**
* Attempts to shut down all activated plugins.
* @deprecated
* @see <a href="https://openjdk.java.net/jeps/421">JEP 421: Deprecate Finalization for Removal</a>
* @see java.lang.Object#finalize()
* @deprecated
*/
@Override
@Deprecated
public void finalize() throws Throwable {
shutDownActivatedPlugins();
}

/**
* Shuts down all plugins
*
Expand Down
21 changes: 11 additions & 10 deletions src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,9 @@

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
Expand Down Expand Up @@ -417,18 +416,20 @@ public static void main(String[] args) throws Exception {
public int run(String[] args) throws Exception {

Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
Option helpOpts = Option.builder("help")
.argName("help")
.desc("show this help message")
.build();
options.addOption(helpOpts);

OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the web graph database to use");
Option webGraphDbOpts = OptionBuilder.create("webgraphdb");
Option webGraphDbOpts = Option.builder("webgraphdb")
.argName("webgraphdb")
.hasArg()
.desc("the web graph database to use")
.build();
options.addOption(webGraphDbOpts);

CommandLineParser parser = new GnuParser();
CommandLineParser parser = new DefaultParser();
try {

CommandLine line = parser.parse(options, args);
Expand Down
Loading