
2. Hadoop Pseudo-Distributed Setup

0. Environment

# 192.168.178.153
# 1 core, 2 GB RAM

1. Inspect the default Hadoop configuration files

cd /soft/hadoop/etc
ls
[centos@153 etc]$ ls
hadoop
[centos@153 etc]$
[centos@153 etc]$
[centos@153 etc]$ ll hadoop/
total 164
-rw-r--r-- 1 centos centos  8814 Sep 14  2020 capacity-scheduler.xml
-rw-r--r-- 1 centos centos  1335 Sep 14  2020 configuration.xsl
-rw-r--r-- 1 centos centos  1211 Sep 14  2020 container-executor.cfg
-rw-r--r-- 1 centos centos   774 Sep 14  2020 core-site.xml
-rw-r--r-- 1 centos centos  4133 Sep 14  2020 hadoop-env.cmd
-rw-r--r-- 1 centos centos  4969 Sep 14  2020 hadoop-env.sh
-rw-r--r-- 1 centos centos  2598 Sep 14  2020 hadoop-metrics2.properties
-rw-r--r-- 1 centos centos  2490 Sep 14  2020 hadoop-metrics.properties
-rw-r--r-- 1 centos centos 10206 Sep 14  2020 hadoop-policy.xml
-rw-r--r-- 1 centos centos   775 Sep 14  2020 hdfs-site.xml
-rw-r--r-- 1 centos centos  2432 Sep 14  2020 httpfs-env.sh
-rw-r--r-- 1 centos centos  1657 Sep 14  2020 httpfs-log4j.properties
-rw-r--r-- 1 centos centos    21 Sep 14  2020 httpfs-signature.secret
-rw-r--r-- 1 centos centos   620 Sep 14  2020 httpfs-site.xml
-rw-r--r-- 1 centos centos  3518 Sep 14  2020 kms-acls.xml
-rw-r--r-- 1 centos centos  3139 Sep 14  2020 kms-env.sh
-rw-r--r-- 1 centos centos  1788 Sep 14  2020 kms-log4j.properties
-rw-r--r-- 1 centos centos  5939 Sep 14  2020 kms-site.xml
-rw-r--r-- 1 centos centos 14016 Sep 14  2020 log4j.properties
-rw-r--r-- 1 centos centos  1076 Sep 14  2020 mapred-env.cmd
-rw-r--r-- 1 centos centos  1507 Sep 14  2020 mapred-env.sh
-rw-r--r-- 1 centos centos  4113 Sep 14  2020 mapred-queues.xml.template
-rw-r--r-- 1 centos centos   758 Sep 14  2020 mapred-site.xml.template
-rw-r--r-- 1 centos centos    10 Sep 14  2020 slaves
-rw-r--r-- 1 centos centos  2316 Sep 14  2020 ssl-client.xml.example
-rw-r--r-- 1 centos centos  2697 Sep 14  2020 ssl-server.xml.example
-rw-r--r-- 1 centos centos  2250 Sep 14  2020 yarn-env.cmd
-rw-r--r-- 1 centos centos  4876 Sep 14  2020 yarn-env.sh
-rw-r--r-- 1 centos centos   690 Sep 14  2020 yarn-site.xml

2. Copy the configuration files

# local = local (standalone) mode, pseudo = pseudo-distributed, full = fully distributed
cp -r hadoop local
cp -r hadoop pseudo
cp -r hadoop full
ls
[centos@153 etc]$ ls
full  hadoop  local  pseudo

3. Delete (or back up) the original configuration directory

rm -rf hadoop
ls
# Or (I think backing up is the better choice!)
mv hadoop hadoop.bak
ls
[centos@153 etc]$ ls
full  hadoop.bak  local  pseudo

4. Create a symlink

# With three sets of configuration files in place, symlinking hadoop to local gives local mode,
# and symlinking it to pseudo gives pseudo-distributed mode
ln -s pseudo hadoop
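
To confirm which mode is active at any time, readlink prints the symlink's target (a quick sanity check):

readlink hadoop
# pseudo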

5. Edit the configuration files required for pseudo-distributed mode

1. core-site.xml

cd pseudo
# Delete the original contents and replace them with the following
vim core-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost/</value>
    </property>
</configuration>

2. hdfs-site.xml

vim hdfs-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>

3. mapred-site.xml

vim mapred-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>

4. yarn-site.xml

vim yarn-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>localhost</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>
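
After editing, hdfs getconf is a quick way to confirm that Hadoop picks up the new values through the hadoop symlink (note: it reads core-site.xml and hdfs-site.xml, so YARN keys are not covered):

hdfs getconf -confKey fs.defaultFS     # expect hdfs://localhost/
hdfs getconf -confKey dfs.replication  # expect 1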

5. hadoop-env.sh

vim hadoop-env.sh
# Edit line 25 to point JAVA_HOME at the JDK
export JAVA_HOME=/soft/jdk
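
For a non-interactive edit, a sed one-liner does the same thing (a sketch; it assumes the stock "export JAVA_HOME=${JAVA_HOME}" line is still present):

sed -i 's|^export JAVA_HOME=.*|export JAVA_HOME=/soft/jdk|' hadoop-env.sh
grep -n 'export JAVA_HOME' hadoop-env.sh   # verify the change landed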

6. Configure passwordless SSH login

#1. Generate a key pair
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa

#2. Copy the public key to the target host
ssh-copy-id 192.168.178.153

#3. Test
ssh centos@192.168.178.153

#4. Set the hostname, add a hosts entry for this machine, then test again
sudo hostnamectl set-hostname s100
bash
sudo vim /etc/hosts
# Append the following
192.168.178.153 s100

#5. Test again
ssh s100
[centos@s100 pseudo]$ ssh s100
The authenticity of host 's100 (192.168.178.153)' can't be established.
ECDSA key fingerprint is SHA256:9pWhPJBa1H/MUWHiYx2FGVbYpeLzjdlfHXnHZBjmHF0.
ECDSA key fingerprint is MD5:0b:95:7e:f0:ab:df:87:8b:3d:74:a3:f0:e7:c6:20:c8.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added 's100' (ECDSA) to the list of known hosts.
Last login: Tue Dec 14 00:40:46 2021 from 192.168.178.153
[centos@s100 ~]$
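
If passwordless login still prompts for a password, overly loose permissions are a common culprit: sshd refuses to use authorized_keys unless the modes are strict (only needed if ssh keeps asking):

chmod 700 ~/.ssh
chmod 600 ~/.ssh/authorized_keys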

7. Format the HDFS filesystem

hdfs namenode -format
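
With no hadoop.tmp.dir configured, the format writes the NameNode metadata under /tmp/hadoop-<user> by default; listing it confirms the format succeeded (assuming stock defaults):

ls /tmp/hadoop-centos/dfs/name/current
# expect fsimage_0000000000000000000, VERSION, seen_txid, ...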

8. Check the Java processes

jps

sudo netstat -tunlp
[centos@s100 etc]$ jps
19601 Jps
[centos@s100 etc]$ sudo netstat -tunlp
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address           Foreign Address         State       PID/Program name
tcp        0      0 0.0.0.0:22              0.0.0.0:*               LISTEN      7338/sshd
tcp6       0      0 :::22                   :::*                    LISTEN      7338/sshd

9. Start the Hadoop daemons

start-all.sh
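
start-all.sh is deprecated in Hadoop 2.x and just delegates to the per-subsystem scripts; starting HDFS and YARN separately is equivalent and makes failures easier to localize:

start-dfs.sh    # NameNode, DataNode, SecondaryNameNode
start-yarn.sh   # ResourceManager, NodeManager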

10. Check the processes again to verify the installation succeeded

jps
# Five new processes have appeared!
20880 ResourceManager
20291 NameNode
20439 DataNode
20697 SecondaryNameNode
21002 NodeManager
[centos@s100 etc]$ jps
20880 ResourceManager
20291 NameNode
21380 Jps
20439 DataNode
20697 SecondaryNameNode
21002 NodeManager
[centos@s100 etc]$
[centos@s100 etc]$ sudo netstat -tunlp
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address           Foreign Address         State       PID/Program name
tcp        0      0 0.0.0.0:50090           0.0.0.0:*               LISTEN      20697/java
tcp        0      0 127.0.0.1:37808         0.0.0.0:*               LISTEN      20439/java
tcp        0      0 127.0.0.1:8020          0.0.0.0:*               LISTEN      20291/java
tcp        0      0 0.0.0.0:50070           0.0.0.0:*               LISTEN      20291/java
tcp        0      0 0.0.0.0:22              0.0.0.0:*               LISTEN      7338/sshd
tcp        0      0 0.0.0.0:50010           0.0.0.0:*               LISTEN      20439/java
tcp        0      0 0.0.0.0:50075           0.0.0.0:*               LISTEN      20439/java
tcp        0      0 0.0.0.0:50020           0.0.0.0:*               LISTEN      20439/java
tcp6       0      0 :::8040                 :::*                    LISTEN      21002/java
tcp6       0      0 :::8042                 :::*                    LISTEN      21002/java
tcp6       0      0 :::36465                :::*                    LISTEN      21002/java
tcp6       0      0 :::22                   :::*                    LISTEN      7338/sshd
tcp6       0      0 127.0.0.1:8088          :::*                    LISTEN      20880/java
tcp6       0      0 :::13562                :::*                    LISTEN      21002/java
tcp6       0      0 127.0.0.1:8030          :::*                    LISTEN      20880/java
tcp6       0      0 127.0.0.1:8031          :::*                    LISTEN      20880/java
tcp6       0      0 127.0.0.1:8032          :::*                    LISTEN      20880/java
tcp6       0      0 127.0.0.1:8033          :::*                    LISTEN      20880/java

11. Browse the HDFS filesystem

hdfs dfs -ls
# Nothing shows up, because we are now on HDFS (pseudo-distributed) and it is still empty!
# (A bare -ls looks for the home directory /user/centos, which does not exist yet)
[centos@s100 etc]$ hdfs dfs -ls
ls: `.': No such file or directory
[centos@s100 etc]$
[centos@s100 etc]$ hdfs dfs -ls /
[centos@s100 etc]$
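
The failing bare -ls can be fixed by creating the user's home directory on HDFS (optional; absolute paths like / already work without it):

hdfs dfs -mkdir -p /user/centos
hdfs dfs -ls    # now resolves to /user/centos instead of erroring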

1. Switch to local mode

#1. First remove the previous symlink
cd /soft/hadoop/etc
rm -rf hadoop   # Caution: rm -rf hadoop/ (with a trailing slash) follows the symlink and wipes out pseudo too!!!

#2. Create the local-mode symlink
ln -s local hadoop
ll

2. Check the filesystem again

# In local mode, hdfs dfs operates on the local filesystem, so a bare -ls now lists the current directory:
[centos@s100 etc]$ hdfs dfs -ls
Found 5 items
drwxr-xr-x   - centos centos       4096 2021-12-14 00:20 full
drwxr-xr-x   - centos centos       4096 2021-12-14 00:20 hadoop
drwxr-xr-x   - centos centos       4096 2021-12-14 00:28 hadoop.bak
drwxr-xr-x   - centos centos       4096 2021-12-14 00:20 local
drwxr-xr-x   - centos centos       4096 2021-12-14 00:55 pseudo

3. Switch back to pseudo-distributed mode

#1. First remove the previous symlink
cd /soft/hadoop/etc
rm -rf hadoop

#2. Create the pseudo-distributed symlink
ln -s pseudo hadoop
ll
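
The remove-then-relink dance can be collapsed into one command: GNU ln's -sfn replaces the symlink in place without following it into the target directory, which sidesteps the rm -rf hadoop/ pitfall above. A small helper sketch (the hswitch name is made up for illustration):

hswitch() {
    # usage: hswitch local|pseudo|full
    ln -sfn "$1" /soft/hadoop/etc/hadoop
}
hswitch pseudo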

12. Check port 50070

netstat -tunlp|grep 50070
[root@s100 ~]# netstat -tunlp
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address           Foreign Address         State       PID/Program name
tcp        0      0 0.0.0.0:50090           0.0.0.0:*               LISTEN      20697/java
tcp        0      0 127.0.0.1:37808         0.0.0.0:*               LISTEN      20439/java
tcp        0      0 127.0.0.1:8020          0.0.0.0:*               LISTEN      20291/java
tcp        0      0 0.0.0.0:50070           0.0.0.0:*               LISTEN      20291/java
tcp        0      0 0.0.0.0:22              0.0.0.0:*               LISTEN      7338/sshd
tcp        0      0 0.0.0.0:50010           0.0.0.0:*               LISTEN      20439/java
tcp        0      0 0.0.0.0:50075           0.0.0.0:*               LISTEN      20439/java
tcp        0      0 0.0.0.0:50020           0.0.0.0:*               LISTEN      20439/java
tcp6       0      0 :::8040                 :::*                    LISTEN      21002/java
tcp6       0      0 :::8042                 :::*                    LISTEN      21002/java
tcp6       0      0 :::36465                :::*                    LISTEN      21002/java
tcp6       0      0 :::22                   :::*                    LISTEN      7338/sshd
tcp6       0      0 127.0.0.1:8088          :::*                    LISTEN      20880/java
tcp6       0      0 :::13562                :::*                    LISTEN      21002/java
tcp6       0      0 127.0.0.1:8030          :::*                    LISTEN      20880/java
tcp6       0      0 127.0.0.1:8031          :::*                    LISTEN      20880/java
tcp6       0      0 127.0.0.1:8032          :::*                    LISTEN      20880/java
tcp6       0      0 127.0.0.1:8033          :::*                    LISTEN      20880/java
[root@s100 ~]# netstat -tunlp|grep 50070
tcp        0      0 0.0.0.0:50070           0.0.0.0:*               LISTEN      20291/java
# The PID is 20291; jps shows that process 20291 is the NameNode
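
On systems with iproute installed, ss gives the same port-to-PID mapping and is the modern replacement for netstat:

sudo ss -tlnp | grep 50070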

13. Check current connections on port 50070

[root@s100 ~]# netstat -anop|grep 50070
tcp        0      0 0.0.0.0:50070           0.0.0.0:*               LISTEN      20291/java           off (0.00/0/0)

14. Access via browser

# http://192.168.178.153:50070

(Screenshot: NameNode web UI at 192.168.178.153:50070)

15. Check connections on port 50070 again

[root@s100 ~]# netstat -anop|grep 50070
tcp        0      0 0.0.0.0:50070           0.0.0.0:*               LISTEN      20291/java           off (0.00/0/0)
tcp        0      0 192.168.178.153:50070   192.168.178.1:58052     ESTABLISHED 20291/java           off (0.00/0/0)
tcp        0      0 192.168.178.153:50070   192.168.178.1:58048     ESTABLISHED 20291/java           off (0.00/0/0)
tcp        0      0 192.168.178.153:50070   192.168.178.1:58051     ESTABLISHED 20291/java           off (0.00/0/0)
tcp        0      0 192.168.178.153:50070   192.168.178.1:58049     ESTABLISHED 20291/java           off (0.00/0/0)
tcp        0      0 192.168.178.153:50070   192.168.178.1:58050     ESTABLISHED 20291/java           off (0.00/0/0)
tcp        0      0 192.168.178.153:50070   192.168.178.1:58034     ESTABLISHED 20291/java           off (0.00/0/0)
# So the page we opened is the web UI of process 20291, the NameNode!!!
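
The same conclusion can be verified from the shell: the NameNode answers plain HTTP on 50070 (a quick probe with curl):

curl -s -o /dev/null -w '%{http_code}\n' http://192.168.178.153:50070/
# expect 200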

1 Analysis

[root@s100 ~]# jps
20880 ResourceManager
20291 NameNode
20439 DataNode
20697 SecondaryNameNode
28105 Jps
21002 NodeManager

# These three are the HDFS filesystem processes
20291 NameNode
20439 DataNode
20697 SecondaryNameNode
# These two are YARN processes (the resource-management layer that MapReduce jobs run on)
20880 ResourceManager
21002 NodeManager

2 Check whether the DataNode and SecondaryNameNode have web UIs

netstat -anop|grep 20439
netstat -anop|grep 20697
[root@s100 ~]# netstat -anop|grep 20439
tcp        0      0 127.0.0.1:37808         0.0.0.0:*               LISTEN      20439/java           off (0.00/0/0)
tcp        0      0 0.0.0.0:50010           0.0.0.0:*               LISTEN      20439/java           off (0.00/0/0)
tcp        0      0 0.0.0.0:50075           0.0.0.0:*               LISTEN      20439/java           off (0.00/0/0)
tcp        0      0 0.0.0.0:50020           0.0.0.0:*               LISTEN      20439/java           off (0.00/0/0)
tcp        0      0 127.0.0.1:55340         127.0.0.1:8020          ESTABLISHED 20439/java           keepalive (5978.75/0/0)
unix  2      [ ]         STREAM     CONNECTED     60895    20439/java
unix  3      [ ]         STREAM     CONNECTED     60914    20439/java
unix  3      [ ]         STREAM     CONNECTED     60913    20439/java
unix  2      [ ]         STREAM     CONNECTED     60909    20439/java
[root@s100 ~]# netstat -anop|grep 20697
tcp        0      0 0.0.0.0:50090           0.0.0.0:*               LISTEN      20697/java           off (0.00/0/0)
unix  2      [ ]         STREAM     CONNECTED     62117    20697/java
unix  2      [ ]         STREAM     CONNECTED     62102    20697/java
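
In Hadoop 2.x the DataNode's HTTP server defaults to port 50075 and the SecondaryNameNode's to 50090, which matches the LISTEN lines above; both serve simple status pages that can be probed the same way:

curl -s -o /dev/null -w '%{http_code}\n' http://192.168.178.153:50075/
curl -s -o /dev/null -w '%{http_code}\n' http://192.168.178.153:50090/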

16. Test HDFS: create a file

# su - centos
hdfs dfs -touchz /1.txt
[root@s100 ~]# su - centos
Last login: Tue Dec 14 00:45:29 CST 2021 from s100 on pts/1
[centos@s100 ~]$
[centos@s100 ~]$
[centos@s100 ~]$
[centos@s100 ~]$ hdfs dfs -touchz /1.txt
[centos@s100 ~]$ hdfs dfs -ls /
Found 1 items
-rw-r--r--   1 centos supergroup          0 2021-12-14 09:46 /1.txt
[centos@s100 ~]$
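
Beyond -touchz, a put/cat/rm round trip pushes real data through the DataNode (a quick sketch; local.txt is a throwaway file name):

echo hello > local.txt
hdfs dfs -put local.txt /
hdfs dfs -cat /local.txt    # hello
hdfs dfs -rm /local.txt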

View in the web UI

# Click Utilities on the right, then "Browse the file system"
# The file we just created is now visible

(Screenshot: /1.txt visible in the HDFS file browser)

17. Test MapReduce: word count

1. Grab a passage of English text from the web

# https://hadoop.apache.org/docs/stable/
vim wordcount.txt
# Paste the following content
Apache Hadoop 3.3.1

Apache Hadoop 3.3.1 incorporates a number of significant enhancements over the previous major release line (hadoop-2.x).

This release is generally available (GA), meaning that it represents a point of API stability and quality that we consider production-ready.
Overview

Users are encouraged to read the full set of release notes. This page provides an overview of the major changes.
Minimum required Java version increased from Java 7 to Java 8

All Hadoop JARs are now compiled targeting a runtime version of Java 8. Users still using Java 7 or below must upgrade to Java 8.
Support for erasure coding in HDFS

Erasure coding is a method for durably storing data with significant space savings compared to replication. Standard encodings like Reed-Solomon (10,4) have a 1.4x space overhead, compared to the 3x overhead of standard HDFS replication.

Since erasure coding imposes additional overhead during reconstruction and performs mostly remote reads, it has traditionally been used for storing colder, less frequently accessed data. Users should consider the network and CPU overheads of erasure coding when deploying this feature.

More details are available in the HDFS Erasure Coding documentation.
YARN Timeline Service v.2

We are introducing an early preview (alpha 2) of a major revision of YARN Timeline Service: v.2. YARN Timeline Service v.2 addresses two major challenges: improving scalability and reliability of Timeline Service, and enhancing usability by introducing flows and aggregation.

YARN Timeline Service v.2 alpha 2 is provided so that users and developers can test it and provide feedback and suggestions for making it a ready replacement for Timeline Service v.1.x. It should be used only in a test capacity.

More details are available in the YARN Timeline Service v.2 documentation.
Shell script rewrite

The Hadoop shell scripts have been rewritten to fix many long-standing bugs and include some new features. While an eye has been kept towards compatibility, some changes may break existing installations.

Incompatible changes are documented in the release notes, with related discussion on HADOOP-9902.

More details are available in the Unix Shell Guide documentation. Power users will also be pleased by the Unix Shell API documentation, which describes much of the new functionality, particularly related to extensibility.
Shaded client jars

The hadoop-client Maven artifact available in 2.x releases pulls Hadoop’s transitive dependencies onto a Hadoop application’s classpath. This can be problematic if the versions of these transitive dependencies conflict with the versions used by the application.

HADOOP-11804 adds new hadoop-client-api and hadoop-client-runtime artifacts that shade Hadoop’s dependencies into a single jar. This avoids leaking Hadoop’s dependencies onto the application’s classpath.
Support for Opportunistic Containers and Distributed Scheduling.

A notion of ExecutionType has been introduced, whereby Applications can now request for containers with an execution type of Opportunistic. Containers of this type can be dispatched for execution at an NM even if there are no resources available at the moment of scheduling. In such a case, these containers will be queued at the NM, waiting for resources to be available for it to start. Opportunistic containers are of lower priority than the default Guaranteed containers and are therefore preempted, if needed, to make room for Guaranteed containers. This should improve cluster utilization.

Opportunistic containers are by default allocated by the central RM, but support has also been added to allow opportunistic containers to be allocated by a distributed scheduler which is implemented as an AMRMProtocol interceptor.

Please see documentation for more details.
MapReduce task-level native optimization

MapReduce has added support for a native implementation of the map output collector. For shuffle-intensive jobs, this can lead to a performance improvement of 30% or more.

See the release notes for MAPREDUCE-2841 for more detail.
Support for more than 2 NameNodes.

The initial implementation of HDFS NameNode high-availability provided for a single active NameNode and a single Standby NameNode. By replicating edits to a quorum of three JournalNodes, this architecture is able to tolerate the failure of any one node in the system.

However, some deployments require higher degrees of fault-tolerance. This is enabled by this new feature, which allows users to run multiple standby NameNodes. For instance, by configuring three NameNodes and five JournalNodes, the cluster is able to tolerate the failure of two nodes rather than just one.

The HDFS high-availability documentation has been updated with instructions on how to configure more than two NameNodes.
Default ports of multiple services have been changed.

Previously, the default ports of multiple Hadoop services were in the Linux ephemeral port range (32768-61000). This meant that at startup, services would sometimes fail to bind to the port due to a conflict with another application.

These conflicting ports have been moved out of the ephemeral range, affecting the NameNode, Secondary NameNode, DataNode, and KMS. Our documentation has been updated appropriately, but see the release notes for HDFS-9427 and HADOOP-12811 for a list of port changes.
Support for Microsoft Azure Data Lake and Aliyun Object Storage System filesystem connectors

Hadoop now supports integration with Microsoft Azure Data Lake and Aliyun Object Storage System as alternative Hadoop-compatible filesystems.
Intra-datanode balancer

A single DataNode manages multiple disks. During normal write operation, disks will be filled up evenly. However, adding or replacing disks can lead to significant skew within a DataNode. This situation is not handled by the existing HDFS balancer, which concerns itself with inter-, not intra-, DN skew.

This situation is handled by the new intra-DataNode balancing functionality, which is invoked via the hdfs diskbalancer CLI. See the disk balancer section in the HDFS Commands Guide for more information.
Reworked daemon and task heap management

A series of changes have been made to heap management for Hadoop daemons as well as MapReduce tasks.

HADOOP-10950 introduces new methods for configuring daemon heap sizes. Notably, auto-tuning is now possible based on the memory size of the host, and the HADOOP_HEAPSIZE variable has been deprecated. See the full release notes of HADOOP-10950 for more detail.

MAPREDUCE-5785 simplifies the configuration of map and reduce task heap sizes, so the desired heap size no longer needs to be specified in both the task configuration and as a Java option. Existing configs that already specify both are not affected by this change. See the full release notes of MAPREDUCE-5785 for more details.
S3Guard: Consistency and Metadata Caching for the S3A filesystem client

HADOOP-13345 adds an optional feature to the S3A client of Amazon S3 storage: the ability to use a DynamoDB table as a fast and consistent store of file and directory metadata.

See S3Guard for more details.
HDFS Router-Based Federation

HDFS Router-Based Federation adds a RPC routing layer that provides a federated view of multiple HDFS namespaces. This is similar to the existing ViewFs) and HDFS Federation functionality, except the mount table is managed on the server-side by the routing layer rather than on the client. This simplifies access to a federated cluster for existing HDFS clients.

See HDFS-10467 and the HDFS Router-based Federation documentation for more details.
API-based configuration of Capacity Scheduler queue configuration

The OrgQueue extension to the capacity scheduler provides a programmatic way to change configurations by providing a REST API that users can call to modify queue configurations. This enables automation of queue configuration management by administrators in the queue’s administer_queue ACL.

See YARN-5734 and the Capacity Scheduler documentation for more information.
YARN Resource Types

The YARN resource model has been generalized to support user-defined countable resource types beyond CPU and memory. For instance, the cluster administrator could define resources like GPUs, software licenses, or locally-attached storage. YARN tasks can then be scheduled based on the availability of these resources.

See YARN-3926 and the YARN resource model documentation for more information.
Getting Started

The Hadoop documentation includes the information you need to get started using Hadoop. Begin with the Single Node Setup which shows you how to set up a single-node Hadoop installation. Then move on to the Cluster Setup to learn how to set up a multi-node Hadoop installation.

2. Push wordcount.txt to HDFS

hdfs dfs -put wordcount.txt /
[centos@s100 ~]$ hdfs dfs -put wordcount.txt /
[centos@s100 ~]$
[centos@s100 ~]$ hdfs dfs -ls /
Found 2 items
-rw-r--r--   1 centos supergroup          0 2021-12-14 09:46 /1.txt
-rw-r--r--   1 centos supergroup       9487 2021-12-14 09:54 /wordcount.txt

3. Look at the MapReduce examples bundled with Hadoop

cd /soft/hadoop/share/hadoop/mapreduce
ll
[centos@s100 mapreduce]$ ll
total 5244
-rw-r--r-- 1 centos centos  586815 Sep 14  2020 hadoop-mapreduce-client-app-2.10.1.jar
-rw-r--r-- 1 centos centos  787989 Sep 14  2020 hadoop-mapreduce-client-common-2.10.1.jar
-rw-r--r-- 1 centos centos 1613911 Sep 14  2020 hadoop-mapreduce-client-core-2.10.1.jar
-rw-r--r-- 1 centos centos  199675 Sep 14  2020 hadoop-mapreduce-client-hs-2.10.1.jar
-rw-r--r-- 1 centos centos   32779 Sep 14  2020 hadoop-mapreduce-client-hs-plugins-2.10.1.jar
-rw-r--r-- 1 centos centos   72212 Sep 14  2020 hadoop-mapreduce-client-jobclient-2.10.1.jar
-rw-r--r-- 1 centos centos 1652223 Sep 14  2020 hadoop-mapreduce-client-jobclient-2.10.1-tests.jar
-rw-r--r-- 1 centos centos   84008 Sep 14  2020 hadoop-mapreduce-client-shuffle-2.10.1.jar
-rw-r--r-- 1 centos centos  303324 Sep 14  2020 hadoop-mapreduce-examples-2.10.1.jar
drwxr-xr-x 2 centos centos    4096 Sep 14  2020 jdiff
drwxr-xr-x 2 centos centos    4096 Sep 14  2020 lib
drwxr-xr-x 2 centos centos      30 Sep 14  2020 lib-examples
drwxr-xr-x 2 centos centos    4096 Sep 14  2020 sources

4. Run the bundled example

hadoop jar /soft/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.10.1.jar wordcount /wordcount.txt /out
# hadoop jar: run a jar with Hadoop
# the jar path apparently must be absolute!
# wordcount: which example to run (word counting)
# /wordcount.txt: the input file to process
# /out: the output directory; note: it must not already exist, or the job fails!
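
Because the output directory must not already exist, delete it before re-running the job:

hdfs dfs -rm -r /out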

5. View the results

hdfs dfs -ls /out
hdfs dfs -cat /out/part-r-00000
[centos@s100 ~]$ hdfs dfs -ls /out
Found 2 items
-rw-r--r--   1 centos supergroup          0 2021-12-14 10:02 /out/_SUCCESS
-rw-r--r--   1 centos supergroup       6745 2021-12-14 10:02 /out/part-r-00000
[centos@s100 ~]$
[centos@s100 ~]$
[centos@s100 ~]$
[centos@s100 ~]$ hdfs dfs -cat /out/part-r-00000
"AS     1
"License");     1
(10,4)  1
(32768-61000).  1
(GA),   1
(alpha  1
(hadoop-2.x).   1
(the    1
-->     2
1.4x    1
2       2
2)      1
2.0     1
2.x     1
3.3.1   2
30%     1
3x      1
7       2
8       1
8.      2
<!--    2
</configuration>        1
<configuration> 1
A       3
ACL.    1
AMRMProtocol    1
ANY     1
API     3
API-based       1
Aliyun  2
All     1
Amazon  1
Apache  3
Applications    1
Azure   2
BASIS,  1
Begin   1
By      1
CLI.    1
CONDITIONS      1
CPU     2
Caching 1
Capacity        2
Cluster 1
Coding  1
Commands        1
Consistency     1
Containers      2
DN      1
Data    2
DataNode        1
DataNode,       1
DataNode.       1
Default 1
Distributed     1
During  1
DynamoDB        1
Erasure 2
ExecutionType   1
Existing        1
Federation      4
For     3
GPUs,   1
Getting 1
Guaranteed      2
Guide   2
HADOOP-10950    2
HADOOP-11804    1
HADOOP-12811    1
HADOOP-13345    1
HADOOP-9902.    1
HADOOP_HEAPSIZE 1
HDFS    13
HDFS-10467      1
HDFS-9427       1
Hadoop  11
Hadoop-compatible       1
Hadoop. 1
Hadoop’s        3
However,        2
IS"     1
In      1
Incompatible    1
Intra-datanode  1
It      1
JARs    1
Java    7
JournalNodes,   2
KIND,   1
KMS.    1
LICENSE 1
Lake    2
License 3
License,        1
License.        2
Licensed        1
Linux   1
MAPREDUCE-2841  1
MAPREDUCE-5785  2
MapReduce       3
Maven   1
Metadata        1
Microsoft       2
Minimum 1
More    3
NM      1
NM,     1
NameNode        2
NameNode,       2
NameNode.       1
NameNodes       1
NameNodes.      3
Node    1
Notably,        1
OF      1
OR      1
Object  2
Opportunistic   3
Opportunistic.  1
OrgQueue        1
Our     1
Overview        1
Please  1
Power   1
Previously,     1
REST    1
RM,     1
RPC     1
Reed-Solomon    1
Resource        1
Reworked        1
Router-Based    2
Router-based    1
S3      1
S3A     2
S3Guard 1
S3Guard:        1
Scheduler       2
Scheduling.     1
Secondary       1
See     10
Service 5
Service,        1
Service:        1
Setup   2
Shaded  1
Shell   3
Since   1
Single  1
Site    1
Standard        1
Standby 1
Started 1
Storage 2
Support 4
System  2
The     7
Then    1
These   1
This    12
Timeline        7
Types   1
Unix    2
Unless  1
Users   3
Version 1
ViewFs) 1
WARRANTIES      1
WITHOUT 1
We      1
While   1
YARN    10
YARN-3926       1
YARN-5734       1
You     1
a       31
ability 1
able    2
access  1
accessed        1
accompanying    1
active  1
added   2
adding  1
additional      1
addresses       1
adds    3
administer_queue        1
administrator   1
administrators  1
affected        1
affecting       1
aggregation.    1
agreed  1
allocated       2
allow   1
allows  1
alpha   1
already 1
also    2
alternative     1
an      8
and     32
another 1
any     1
applicable      1
application.    2
application’s   2
appropriately,  1
architecture    1
are     12
artifact        1
artifacts       1
as      6
at      5
auto-tuning     1
automation      1
availability    1
available       7
avoids  1
balancer        2
balancer,       1
balancing       1
based   2
be      10
been    12
below   1
beyond  1
bind    1
both    2
break   1
bugs    1
but     2
by      15
call    1
can     8
capacity        1
capacity.       1
case,   1
central 1
challenges:     1
change  1
change. 1
changed.        1
changes 3
changes.        2
classpath.      2
client  3
client. 1
clients.        1
cluster 4
coding  4
colder, 1
collector.      1
compared        2
compatibility,  1
compiled        1
compliance      1
concerns        1
configs 1
configuration   6
configurations  1
configurations. 1
configure       1
configuring     2
conflict        2
conflicting     1
connectors      1
consider        2
consistent      1
containers      6
containers.     1
copy    1
could   1
countable       1
daemon  2
daemons 1
data    1
data.   1
default 3
define  1
degrees 1
dependencies    4
deploying       1
deployments     1
deprecated.     1
describes       1
desired 1
detail. 2
details 3
details.        4
developers      1
directory       1
discussion      1
disk    1
diskbalancer    1
disks   2
disks.  1
dispatched      1
distributed     3
documentation   7
documentation,  1
documentation.  3
documented      1
due     1
durably 1
during  1
early   1
edits   1
either  1
enabled 1
enables 1
encodings       1
encouraged      1
enhancements    1
enhancing       1
ephemeral       2
erasure 3
even    1
evenly. 1
except  2
execution       2
existing        4
express 1
extensibility.  1
extension       1
eye     1
fail    1
failure 2
fast    1
fault-tolerance.        1
feature 1
feature,        1
feature.        1
features.       1
federated       2
feedback        1
file    2
file.   1
filesystem      2
filesystems.    1
filled  1
five    1
fix     1
flows   1
for     32
frequently      1
from    1
full    3
functionality,  3
generalized     1
generally       1
get     1
governing       1
hadoop-client   1
hadoop-client-api       1
hadoop-client-runtime   1
handled 2
has     9
have    5
hdfs    1
heap    5
high-availability       2
higher  1
host,   1
how     3
http://www.apache.org/licenses/LICENSE-2.0      1
if      3
implementation  2
implemented     1
implied.        1
imposes 1
improve 1
improvement     1
improving       1
in      14
include 1
includes        1
incorporates    1
increased       1
information     1
information.    3
initial 1
installation.   1
installation.?xml       1
installations.  1
instance,       2
instructions    1
integration     1
inter-, 1
interceptor.    1
into    1
intra-, 1
intra-DataNode  1
introduced,     1
introduces      1
introducing     2
invoked 1
is      14
it      5
itself  1
jar.    1
jars    1
jobs,   1
just    1
kept    1
language        1
law     1
layer   2
lead    2
leaking 1
learn   1
less    1
licenses,       1
like    2
limitations     1
line    1
list    1
locally-attached        1
long-standing   1
longer  1
lower   1
made    1
major   4
make    1
making  1
managed 1
management      3
manages 1
many    1
map     2
may     3
meaning 1
meant   1
memory  1
memory. 1
metadata.       1
method  1
methods 1
model   2
modify  1
moment  1
more    11
more.   1
mostly  1
mount   1
move    1
moved   1
much    1
multi-node      1
multiple        5
must    1
namespaces.     1
native  2
need    1
needed, 1
needs   1
network 1
new     6
no      2
node    1
nodes   1
normal  1
not     4
notes   4
notes,  1
notes.  1
notion  1
now     4
number  1
obtain  1
of      40
on      8
one     1
one.    1
only    1
onto    2
operation,      1
opportunistic   1
optimization    1
option. 1
optional        1
or      6
out     1
output  1
over    1
overhead        2
overhead,       1
overheads       1
overview        1
page    1
particularly    1
performance     1
performs        1
permissions     1
pleased 1
point   1
port    3
ports   3
possible        1
preempted,      1
preview 1
previous        1
priority        1
problematic     1
production-ready.       1
programmatic    1
properties      1
provide 1
provided        2
provides        3
providing       1
pulls   1
quality 1
queue   3
queued  1
queue’s 1
quorum  1
range   1
range,  1
rather  2
read    1
reads,  1
ready   1
reconstruction  1
reduce  1
related 2
release 8
releases        1
reliability     1
remote  1
replacement     1
replacing       1
replicating     1
replication.    2
represents      1
request 1
require 1
required        2
resource        3
resources       3
resources.      1
revision        1
rewrite 1
rewritten       1
room    1
routing 2
run     1
runtime 1
savings 1
scalability     1
scheduled       1
scheduler       2
scheduling.     1
script  1
scripts 1
section 1
see     2
series  1
server-side     1
services        3
set     3
shade   1
shell   1
should  3
shows   1
shuffle-intensive       1
significant     3
similar 1
simplifies      2
single  4
single-node     1
situation       2
size    2
sizes,  1
sizes.  1
skew    1
skew.   1
so      2
software        2
some    3
sometimes       1
space   2
specific        2
specified       1
specify 1
stability       1
standard        1
standby 1
start.  1
started 1
startup,        1
still   1
storage.        1
storage:        1
store   1
storing 2
such    1
suggestions     1
support 3
supports        1
system. 1
table   2
targeting       1
task    3
task-level      1
tasks   1
tasks.  1
test    2
than    5
that    8
the     69
then    1
there   1
therefore       1
these   3
this    7
three   2
to      38
tolerate        2
towards 1
traditionally   1
transitive      2
two     3
type    2
types   1
under   3
up      3
updated 2
upgrade 1
usability       1
use     2
used    3
user-defined    1
users   4
using   2
utilization.    1
v.1.x.  1
v.2     4
v.2.    1
variable        1
version 2
version="1.0"?> 1
versions        2
via     1
view    1
waiting 1
way     1
we      1
well    1
were    1
when    1
whereby 1
which   6
will    3
with    10
within  1
would   1
write   1
writing,        1
you     3
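
The output above is sorted alphabetically by key; to see the most frequent words instead, pipe it through sort (a quick post-processing sketch):

hdfs dfs -cat /out/part-r-00000 | sort -k2 -nr | head
# the     69
# of      40
# to      38
# ...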

Last updated: 2022-02-20 08:44:07